1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
21
22using namespace llvm;
23
24namespace {
25
26struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
27 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
28
29 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
30 if (Arg.getAsInteger(0, Value))
31 return O.error("'" + Arg + "' value invalid for uint argument!");
32
33 if (Value > 100)
34 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
35
36 return false;
37 }
38};
39
40} // end anonymous namespace
41
42static cl::opt<unsigned, false, MFMAPaddingRatioParser>
43 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
44 cl::desc("Fill a percentage of the latency between "
45 "neighboring MFMA with s_nops."));
46
47// This is intended for debugging purposes only.
48static cl::opt<unsigned>
49 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
50 cl::desc("Insert a s_nop x before every instruction"));
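// Illustrative usage (not from the source): both options are ordinary llc
// cl::opt flags, e.g.
//   llc -mtriple=amdgcn -mcpu=gfx90a  -amdgpu-mfma-padding-ratio=50 in.ll
//   llc -mtriple=amdgcn -mcpu=gfx1030 -amdgpu-snop-padding=1 in.ll
// A ratio of 50 requests that half of the latency between neighboring MFMAs
// be filled with s_nops; the parser above rejects values outside [0, 100].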
51
52//===----------------------------------------------------------------------===//
53// Hazard Recognizer Implementation
54//===----------------------------------------------------------------------===//
55
56static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57 const GCNSubtarget &ST);
58
59GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
64 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
65 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
66}
67
68void GCNHazardRecognizer::Reset() {
69 EmittedInstrs.clear();
70}
71
72void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
73 EmitInstruction(SU->getInstr());
74}
75
76void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
77 CurrCycleInstr = MI;
78}
79
80static bool isDivFMas(unsigned Opcode) {
81 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
82}
83
84static bool isSGetReg(unsigned Opcode) {
85 return Opcode == AMDGPU::S_GETREG_B32;
86}
87
88static bool isSSetReg(unsigned Opcode) {
89 switch (Opcode) {
90 case AMDGPU::S_SETREG_B32:
91 case AMDGPU::S_SETREG_B32_mode:
92 case AMDGPU::S_SETREG_IMM32_B32:
93 case AMDGPU::S_SETREG_IMM32_B32_mode:
94 return true;
95 }
96 return false;
97}
98
99static bool isRWLane(unsigned Opcode) {
100 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
101}
102
103static bool isRFE(unsigned Opcode) {
104 return Opcode == AMDGPU::S_RFE_B64;
105}
106
107static bool isSMovRel(unsigned Opcode) {
108 switch (Opcode) {
109 case AMDGPU::S_MOVRELS_B32:
110 case AMDGPU::S_MOVRELS_B64:
111 case AMDGPU::S_MOVRELD_B32:
112 case AMDGPU::S_MOVRELD_B64:
113 return true;
114 default:
115 return false;
116 }
117}
118
119static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
120 const MachineInstr &MI) {
121 if (TII.isAlwaysGDS(MI.getOpcode()))
122 return true;
123
124 switch (MI.getOpcode()) {
125 case AMDGPU::S_SENDMSG:
126 case AMDGPU::S_SENDMSGHALT:
127 case AMDGPU::S_TTRACEDATA:
128 return true;
129 // These DS opcodes don't support GDS.
130 case AMDGPU::DS_NOP:
131 case AMDGPU::DS_PERMUTE_B32:
132 case AMDGPU::DS_BPERMUTE_B32:
133 return false;
134 default:
135 if (TII.isDS(MI.getOpcode())) {
136 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
137 AMDGPU::OpName::gds);
138 if (MI.getOperand(GDS).getImm())
139 return true;
140 }
141 return false;
142 }
143}
144
145static bool isPermlane(const MachineInstr &MI) {
146 unsigned Opcode = MI.getOpcode();
147 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
148 Opcode == AMDGPU::V_PERMLANE64_B32 ||
149 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
150 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
151 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
152 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
153 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
154 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
155 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
156 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
157 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
158 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
159 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
160 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
161}
162
163static bool isLdsDma(const MachineInstr &MI) {
164 return SIInstrInfo::isVALU(MI) &&
165 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
166}
167
168static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
169 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
170 AMDGPU::OpName::simm16);
171 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
172}
173
174ScheduleHazardRecognizer::HazardType
175GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
176 MachineInstr *MI = SU->getInstr();
177 // If we are not in "HazardRecognizerMode" and therefore not being run from
178 // the scheduler, track possible stalls from hazards but don't insert noops.
179 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
180
181 if (MI->isBundle())
182 return NoHazard;
183
184 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
185 return HazardType;
186
187 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
188 return HazardType;
189
190 if (checkFPAtomicToDenormModeHazard(MI) > 0)
191 return HazardType;
192
193 if (ST.hasNoDataDepHazard())
194 return NoHazard;
195
196 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
197 return HazardType;
198
199 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
200 return HazardType;
201
202 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
203 return HazardType;
204
205 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
206 return HazardType;
207
208 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
209 return HazardType;
210
211 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
212 SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
213 checkMAIVALUHazards(MI) > 0)
214 return HazardType;
215
216 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
217 return HazardType;
218
219 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
220 return HazardType;
221
222 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
223 return HazardType;
224
225 if (((ST.hasReadM0MovRelInterpHazard() &&
226 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
227 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
228 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
229 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
230 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
231 (ST.hasReadM0LdsDirectHazard() &&
232 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
233 checkReadM0Hazards(MI) > 0)
234 return HazardType;
235
236 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
237 return HazardType;
238
239 if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI)) &&
240 checkMAILdStHazards(MI) > 0)
241 return HazardType;
242
243 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
244 return HazardType;
245
246 return NoHazard;
247}
248
249static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
250 unsigned Quantity) {
251 while (Quantity > 0) {
252 unsigned Arg = std::min(Quantity, 8u);
253 Quantity -= Arg;
254 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
255 .addImm(Arg - 1);
256 }
257}
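// For example, a request for 10 wait states is emitted as "s_nop 7" (8 wait
// states) followed by "s_nop 1" (2 wait states), since a single s_nop can
// encode at most 8.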
258
259unsigned
260GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
261 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
262 assert(TSchedModel.getWriteProcResBegin(SC) !=
263 TSchedModel.getWriteProcResEnd(SC));
264 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
265}
266
267void GCNHazardRecognizer::processBundle() {
268 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
269 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
270 // Check bundled MachineInstr's for hazards.
271 for (; MI != E && MI->isInsideBundle(); ++MI) {
272 CurrCycleInstr = &*MI;
273 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
274
275 if (IsHazardRecognizerMode) {
276 fixHazards(CurrCycleInstr);
277
278 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
279 }
280
281 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
282 // include the bundled MI directly after, only add a maximum of
283 // (MaxLookAhead - 1) noops to EmittedInstrs.
284 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
285 EmittedInstrs.push_front(nullptr);
286
287 EmittedInstrs.push_front(CurrCycleInstr);
288 EmittedInstrs.resize(MaxLookAhead);
289 }
290 CurrCycleInstr = nullptr;
291}
292
293void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
294 assert(IsHazardRecognizerMode);
295
296 unsigned NumPreNoops = PreEmitNoops(MI);
297 EmitNoops(NumPreNoops);
298 if (MI->isInsideBundle())
299 insertNoopsInBundle(MI, TII, NumPreNoops);
300 else
301 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
302 NumPreNoops);
303 EmitInstruction(MI);
304 AdvanceCycle();
305}
306
307unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
308 IsHazardRecognizerMode = true;
309 CurrCycleInstr = MI;
310 unsigned W = PreEmitNoopsCommon(MI);
311 fixHazards(MI);
312 CurrCycleInstr = nullptr;
313 return std::max(W, NopPadding.getValue());
314}
315
316unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
317 if (MI->isBundle())
318 return 0;
319
320 int WaitStates = 0;
321
322 if (SIInstrInfo::isSMRD(*MI))
323 return std::max(WaitStates, checkSMRDHazards(MI));
324
325 if (ST.hasNSAtoVMEMBug())
326 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
327
328 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
329
330 if (ST.hasNoDataDepHazard())
331 return WaitStates;
332
333 if (SIInstrInfo::isVMEM(*MI))
334 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
335
336 if (SIInstrInfo::isVALU(*MI))
337 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
338
339 if (SIInstrInfo::isDPP(*MI))
340 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
341
342 if (isDivFMas(MI->getOpcode()))
343 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
344
345 if (isRWLane(MI->getOpcode()))
346 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
347
348 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
349 SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
350 checkMAIVALUHazards(MI) > 0)
351 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
352
353 if (MI->isInlineAsm())
354 return std::max(WaitStates, checkInlineAsmHazards(MI));
355
356 if (isSGetReg(MI->getOpcode()))
357 return std::max(WaitStates, checkGetRegHazards(MI));
358
359 if (isSSetReg(MI->getOpcode()))
360 return std::max(WaitStates, checkSetRegHazards(MI));
361
362 if (isRFE(MI->getOpcode()))
363 return std::max(WaitStates, checkRFEHazards(MI));
364
365 if ((ST.hasReadM0MovRelInterpHazard() &&
366 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
367 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
368 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
369 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
370 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
371 (ST.hasReadM0LdsDirectHazard() &&
372 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
373 return std::max(WaitStates, checkReadM0Hazards(MI));
374
375 if (SIInstrInfo::isMAI(*MI))
376 return std::max(WaitStates, checkMAIHazards(MI));
377
378 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI))
379 return std::max(WaitStates, checkMAILdStHazards(MI));
380
381 if (ST.hasGFX950Insts() && isPermlane(*MI))
382 return std::max(WaitStates, checkPermlaneHazards(MI));
383
384 return WaitStates;
385}
386
387void GCNHazardRecognizer::EmitNoop() {
388 EmittedInstrs.push_front(nullptr);
389}
390
391void GCNHazardRecognizer::AdvanceCycle() {
392 // When the scheduler detects a stall, it will call AdvanceCycle() without
393 // emitting any instructions.
394 if (!CurrCycleInstr) {
395 EmittedInstrs.push_front(nullptr);
396 return;
397 }
398
399 if (CurrCycleInstr->isBundle()) {
400 processBundle();
401 return;
402 }
403
404 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
405 if (!NumWaitStates) {
406 CurrCycleInstr = nullptr;
407 return;
408 }
409
410 // Keep track of emitted instructions
411 EmittedInstrs.push_front(CurrCycleInstr);
412
413 // Add a nullptr for each additional wait state after the first. Make sure
414 // not to add more than getMaxLookAhead() items to the list, since we
415 // truncate the list to that size right after this loop.
416 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
417 i < e; ++i) {
418 EmittedInstrs.push_front(nullptr);
419 }
420
421 // getMaxLookahead() is the largest number of wait states we will ever need
422 // to insert, so there is no point in keeping track of more than that many
423 // wait states.
424 EmittedInstrs.resize(getMaxLookAhead());
425
426 CurrCycleInstr = nullptr;
427}
428
429void GCNHazardRecognizer::RecedeCycle() {
430 assert(!IsHazardRecognizerMode &&
431 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
432}
433
434//===----------------------------------------------------------------------===//
435// Helper Functions
436//===----------------------------------------------------------------------===//
437
438using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
439
440using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
441using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
442
443// Search for a hazard in a block and its predecessors.
444template <typename StateT>
445static bool
446hasHazard(StateT State,
447 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
448 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
449 const MachineBasicBlock *MBB,
450 MachineBasicBlock::const_reverse_instr_iterator I,
451 DenseSet<const MachineBasicBlock *> &Visited) {
452 for (auto E = MBB->instr_rend(); I != E; ++I) {
453 // No need to look at parent BUNDLE instructions.
454 if (I->isBundle())
455 continue;
456
457 switch (IsHazard(State, *I)) {
458 case HazardFound:
459 return true;
460 case HazardExpired:
461 return false;
462 default:
463 // Continue search
464 break;
465 }
466
467 if (I->isInlineAsm() || I->isMetaInstruction())
468 continue;
469
470 UpdateState(State, *I);
471 }
472
473 for (MachineBasicBlock *Pred : MBB->predecessors()) {
474 if (!Visited.insert(Pred).second)
475 continue;
476
477 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
478 Visited))
479 return true;
480 }
481
482 return false;
483}
484
485// Returns the minimum number of wait states since \p I, walking all
486// predecessors. Only scans until \p IsExpired returns true.
487// Can only be run in hazard recognizer mode.
488static int getWaitStatesSince(
489 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
490 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
491 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
492 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
493 for (auto E = MBB->instr_rend(); I != E; ++I) {
494 // Don't add WaitStates for parent BUNDLE instructions.
495 if (I->isBundle())
496 continue;
497
498 if (IsHazard(*I))
499 return WaitStates;
500
501 if (I->isInlineAsm())
502 continue;
503
504 WaitStates += GetNumWaitStates(*I);
505
506 if (IsExpired(*I, WaitStates))
507 return std::numeric_limits<int>::max();
508 }
509
510 int MinWaitStates = std::numeric_limits<int>::max();
511 for (MachineBasicBlock *Pred : MBB->predecessors()) {
512 if (!Visited.insert(Pred).second)
513 continue;
514
515 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
516 IsExpired, Visited, GetNumWaitStates);
517
518 MinWaitStates = std::min(MinWaitStates, W);
519 }
520
521 return MinWaitStates;
522}
523
524static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
525 const MachineInstr *MI, IsExpiredFn IsExpired) {
526 DenseSet<const MachineBasicBlock *> Visited;
527 return getWaitStatesSince(IsHazard, MI->getParent(),
528 std::next(MI->getReverseIterator()), 0, IsExpired,
529 Visited);
530}
531
532int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
533 if (IsHazardRecognizerMode) {
534 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
535 return WaitStates >= Limit;
536 };
537 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
538 }
539
540 int WaitStates = 0;
541 for (MachineInstr *MI : EmittedInstrs) {
542 if (MI) {
543 if (IsHazard(*MI))
544 return WaitStates;
545
546 if (MI->isInlineAsm())
547 continue;
548 }
549 ++WaitStates;
550
551 if (WaitStates >= Limit)
552 break;
553 }
554 return std::numeric_limits<int>::max();
555}
556
557int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
558 IsHazardFn IsHazardDef,
559 int Limit) {
560 const SIRegisterInfo *TRI = ST.getRegisterInfo();
561
562 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
563 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
564 };
565
566 return getWaitStatesSince(IsHazardFn, Limit);
567}
568
569int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
570 int Limit) {
571 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
572 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
573 };
574
575 return getWaitStatesSince(IsHazardFn, Limit);
576}
577
578//===----------------------------------------------------------------------===//
579// No-op Hazard Detection
580//===----------------------------------------------------------------------===//
581
582static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
583 MCRegister Reg) {
584 for (MCRegUnit Unit : TRI.regunits(Reg))
585 BV.set(Unit);
586}
587
588static void addRegsToSet(const SIRegisterInfo &TRI,
589 iterator_range<MachineInstr::const_mop_iterator> Ops,
590 BitVector &DefSet, BitVector &UseSet) {
591 for (const MachineOperand &Op : Ops) {
592 if (Op.isReg())
593 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
594 }
595}
596
597void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
598 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
599}
600
601static bool breaksSMEMSoftClause(MachineInstr *MI) {
602 return !SIInstrInfo::isSMRD(*MI);
603}
604
605static bool breaksVMEMSoftClause(MachineInstr *MI) {
606 return !SIInstrInfo::isVMEM(*MI);
607}
608
609int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
610 // SMEM soft clauses are only present on VI+, and only matter if xnack is
611 // enabled.
612 if (!ST.isXNACKEnabled())
613 return 0;
614
615 bool IsSMRD = TII.isSMRD(*MEM);
616
617 resetClause();
618
619 // A soft-clause is any group of consecutive SMEM instructions. The
620 // instructions in this group may return out of order and/or may be
621 // replayed (i.e. the same instruction issued more than once).
622 //
623 // In order to handle these situations correctly we need to make sure that
624 // when a clause has more than one instruction, no instruction in the clause
625 // writes to a register that is read by another instruction in the clause
626 // (including itself). If we encounter this situation, we need to break the
627 // clause by inserting a non SMEM instruction.
628
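  // Illustrative clause (not from the source) that trips the check below:
  //   s_load_dwordx2 s[0:1], s[2:3], 0x0   ; defines s0, s1
  //   s_load_dword   s4,     s[0:1], 0x8   ; uses s0, s1
  // The second load reads registers written by the first, so ClauseDefs and
  // ClauseUses intersect and the clause must be broken before it.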
629 for (MachineInstr *MI : EmittedInstrs) {
630 // When we hit a non-SMEM instruction then we have passed the start of the
631 // clause and we can stop.
632 if (!MI)
633 break;
634
635 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
636 break;
637
638 addClauseInst(*MI);
639 }
640
641 if (ClauseDefs.none())
642 return 0;
643
644 // We need to make sure not to put loads and stores in the same clause if they
645 // use the same address. For now, just start a new clause whenever we see a
646 // store.
647 if (MEM->mayStore())
648 return 1;
649
650 addClauseInst(*MEM);
651
652 // If the set of defs and uses intersect then we cannot add this instruction
653 // to the clause, so we have a hazard.
654 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
655}
656
657int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
658 int WaitStatesNeeded = 0;
659
660 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
661
662 // This SMRD hazard only affects SI.
663 if (!ST.hasSMRDReadVALUDefHazard())
664 return WaitStatesNeeded;
665
666 // A read of an SGPR by SMRD instruction requires 4 wait states when the
667 // SGPR was written by a VALU instruction.
668 int SmrdSgprWaitStates = 4;
669 auto IsHazardDefFn = [this](const MachineInstr &MI) {
670 return TII.isVALU(MI);
671 };
672 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
673 return TII.isSALU(MI);
674 };
675
676 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
677
678 for (const MachineOperand &Use : SMRD->uses()) {
679 if (!Use.isReg())
680 continue;
681 int WaitStatesNeededForUse =
682 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
683 SmrdSgprWaitStates);
684 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
685
686 // This fixes what appears to be undocumented hardware behavior in SI where
687 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
688 // need some number of nops in between. We don't know how many we need, but
689 // let's use 4. This wasn't discovered before probably because the only
690 // case when this happens is when we expand a 64-bit pointer into a full
691 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
692 // probably never encountered in the closed-source land.
693 if (IsBufferSMRD) {
694 int WaitStatesNeededForUse =
695 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
696 IsBufferHazardDefFn,
697 SmrdSgprWaitStates);
698 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
699 }
700 }
701
702 return WaitStatesNeeded;
703}
704
705int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
706 if (!ST.hasVMEMReadSGPRVALUDefHazard())
707 return 0;
708
709 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
710
711 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
712 // SGPR was written by a VALU Instruction.
713 const int VmemSgprWaitStates = 5;
714 auto IsHazardDefFn = [this](const MachineInstr &MI) {
715 return TII.isVALU(MI);
716 };
717 for (const MachineOperand &Use : VMEM->uses()) {
718 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
719 continue;
720
721 int WaitStatesNeededForUse =
722 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
723 VmemSgprWaitStates);
724 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
725 }
726 return WaitStatesNeeded;
727}
728
729int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
730 const SIRegisterInfo *TRI = ST.getRegisterInfo();
731 const SIInstrInfo *TII = ST.getInstrInfo();
732
733 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
734 int DppVgprWaitStates = 2;
735 int DppExecWaitStates = 5;
736 int WaitStatesNeeded = 0;
737 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
738 return TII->isVALU(MI);
739 };
740
741 for (const MachineOperand &Use : DPP->uses()) {
742 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
743 continue;
744 int WaitStatesNeededForUse =
745 DppVgprWaitStates - getWaitStatesSinceDef(
746 Use.getReg(),
747 [](const MachineInstr &) { return true; },
748 DppVgprWaitStates);
749 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
750 }
751
752 WaitStatesNeeded = std::max(
753 WaitStatesNeeded,
754 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
755 DppExecWaitStates));
756
757 return WaitStatesNeeded;
758}
759
760int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
761 const SIInstrInfo *TII = ST.getInstrInfo();
762
763 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
764 // instruction.
765 const int DivFMasWaitStates = 4;
766 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
767 return TII->isVALU(MI);
768 };
769 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
770 DivFMasWaitStates);
771
772 return DivFMasWaitStates - WaitStatesNeeded;
773}
774
775int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
776 const SIInstrInfo *TII = ST.getInstrInfo();
777 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
778
779 const int GetRegWaitStates = 2;
780 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
781 return GetRegHWReg == getHWReg(TII, MI);
782 };
783 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
784
785 return GetRegWaitStates - WaitStatesNeeded;
786}
787
788int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
789 const SIInstrInfo *TII = ST.getInstrInfo();
790 unsigned HWReg = getHWReg(TII, *SetRegInstr);
791
792 const int SetRegWaitStates = ST.getSetRegWaitStates();
793 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
794 return HWReg == getHWReg(TII, MI);
795 };
796 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
797 return SetRegWaitStates - WaitStatesNeeded;
798}
799
800int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
801 if (!MI.mayStore())
802 return -1;
803
804 const SIInstrInfo *TII = ST.getInstrInfo();
805 unsigned Opcode = MI.getOpcode();
806 const MCInstrDesc &Desc = MI.getDesc();
807
808 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
809 int VDataRCID = -1;
810 if (VDataIdx != -1)
811 VDataRCID = Desc.operands()[VDataIdx].RegClass;
812
813 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
814 // There is no hazard if the instruction does not use vector regs
815 // (like wbinvl1)
816 if (VDataIdx == -1)
817 return -1;
818 // For MUBUF/MTBUF instructions this hazard only exists if the
819 // instruction is not using a register in the soffset field.
820 const MachineOperand *SOffset =
821 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
822 // If we have no soffset operand, then assume this field has been
823 // hardcoded to zero.
824 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
825 (!SOffset || !SOffset->isReg()))
826 return VDataIdx;
827 }
828
829 // MIMG instructions create a hazard if they don't use a 256-bit T# and
830 // the store size is greater than 8 bytes and they have more than two bits
831 // of their dmask set.
832 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
833 if (TII->isMIMG(MI)) {
834 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
835 assert(SRsrcIdx != -1 &&
836 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
837 (void)SRsrcIdx;
838 }
839
840 if (TII->isFLAT(MI)) {
841 // There is no hazard if the instruction does not use vector regs
842 if (VDataIdx == -1)
843 return -1;
844
845 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
846 return VDataIdx;
847 }
848
849 return -1;
850}
851
852int
853GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
854 const MachineRegisterInfo &MRI) {
855 // Helper to check for the hazard where VMEM instructions that store more than
856 // 8 bytes can have their store data overwritten by the next instruction.
857 const SIRegisterInfo *TRI = ST.getRegisterInfo();
858
859 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
860 int WaitStatesNeeded = 0;
861
862 if (!TRI->isVectorRegister(MRI, Def.getReg()))
863 return WaitStatesNeeded;
864 Register Reg = Def.getReg();
865 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
866 int DataIdx = createsVALUHazard(MI);
867 return DataIdx >= 0 &&
868 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
869 };
870
871 int WaitStatesNeededForDef =
872 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
873 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
874
875 return WaitStatesNeeded;
876}
877
878/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
879/// pack the computed value into correct bit position of the dest register. This
880/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
881/// dst_sel that is not aligned to the register. This function analyzes the \p
882/// MI and \returns an operand with dst forwarding issue, or nullptr if
883/// none exists.
884static const MachineOperand *
885getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
886 if (!SIInstrInfo::isVALU(MI))
887 return nullptr;
888
889 const SIInstrInfo *TII = ST.getInstrInfo();
890
891 unsigned Opcode = MI.getOpcode();
892
893 // There are three different types of instructions
894 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
895 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
896 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
897 // op_sel[3:2]
898 // != 0
899 if (SIInstrInfo::isSDWA(MI)) {
900 // Type 1: SDWA with dst_sel != DWORD
901 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
902 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
903 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
904 }
905
906 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
907 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
908 // Type 2: VOP3 which write the hi bits
909 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
910 SISrcMods::DST_OP_SEL)
911 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
912
913 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
914 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
915 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
917 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
918 }
919
920 // Special case: nop is required for all the opsel values for fp4 sr variant
921 // cvt scale instructions
922 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
923 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
924
925 return nullptr;
926}
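// Illustrative producers (assumption, not from the source) that would be
// flagged by this helper:
//   v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1    ; SDWA, dst_sel != DWORD
//   v_add_f16_e64 v0, v1, v2 op_sel:[0,0,1]     ; VOP3 writing the hi half
// In both cases the result is packed into a sub-dword lane of v0, so a
// consumer issued too soon may observe a stale or mis-swizzled value.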
927
928/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
929/// forwarding issue \p Dst . We may "consume" the Dst via a standard explicit
930/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW)
931static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
932 const MachineOperand *Dst,
933 const SIRegisterInfo *TRI) {
934 // We must consider implicit reads of the VALU. SDWA with dst_sel and
935 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
936 // and we must account for that hazard.
937 // We also must account for WAW hazards. In particular, WAW with dest
938 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
939 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
940 // check for ECC. Without accounting for this hazard, the ECC will be
941 // wrong.
942 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
943 // complete zeroesHigh16BitsOfDest)
944 for (auto &Operand : VALU->operands()) {
945 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
946 return true;
947 }
948 }
949 return false;
950}
951
952int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
953 int WaitStatesNeeded = 0;
954
954
955 if (ST.hasTransForwardingHazard()) {
956 const int TransDefWaitstates = 1;
957
958 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
959 if (!SIInstrInfo::isTRANS(MI))
960 return false;
961 const SIRegisterInfo *TRI = ST.getRegisterInfo();
962 const SIInstrInfo *TII = ST.getInstrInfo();
963 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
964
965 for (const MachineOperand &Use : VALU->explicit_uses()) {
966 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
967 return true;
968 }
969
970 return false;
971 };
972
973 int WaitStatesNeededForDef =
974 TransDefWaitstates -
975 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
976 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
977 }
978
980 const int Shift16DefWaitstates = 1;
981
982 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
983 const SIRegisterInfo *TRI = ST.getRegisterInfo();
984 const MachineOperand *ForwardedDst =
985 getDstSelForwardingOperand(ProducerMI, ST);
986 if (ForwardedDst) {
987 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
988 }
989
990 if (ProducerMI.isInlineAsm()) {
991 // Assume inline asm has dst forwarding hazard
992 for (auto &Def : ProducerMI.all_defs()) {
993 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
994 return true;
995 }
996 }
997
998 return false;
999 };
1000
1001 int WaitStatesNeededForDef =
1002 Shift16DefWaitstates -
1003 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1004 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1005 }
1006
1007 if (ST.hasVDecCoExecHazard()) {
1008 const int VALUWriteSGPRVALUReadWaitstates = 2;
1009 const int VALUWriteEXECRWLane = 4;
1010 const int VALUWriteVGPRReadlaneRead = 1;
1011
1012 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1013 const MachineRegisterInfo &MRI = MF.getRegInfo();
1014 Register UseReg;
1015 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1016 if (!SIInstrInfo::isVALU(MI))
1017 return false;
1018 return MI.modifiesRegister(UseReg, TRI);
1019 };
1020
1021 for (const MachineOperand &Use : VALU->explicit_uses()) {
1022 if (!Use.isReg())
1023 continue;
1024
1025 UseReg = Use.getReg();
1026 if (TRI->isSGPRReg(MRI, UseReg)) {
1027 int WaitStatesNeededForDef =
1028 VALUWriteSGPRVALUReadWaitstates -
1029 getWaitStatesSince(IsVALUDefSGPRFn,
1030 VALUWriteSGPRVALUReadWaitstates);
1031 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1032 }
1033 }
1034
1035 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1036 UseReg = AMDGPU::VCC;
1037 int WaitStatesNeededForDef =
1038 VALUWriteSGPRVALUReadWaitstates -
1039 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1040 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1041 }
1042
1043 switch (VALU->getOpcode()) {
1044 case AMDGPU::V_READLANE_B32:
1045 case AMDGPU::V_READFIRSTLANE_B32: {
1046 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1047 UseReg = Src->getReg();
1048 int WaitStatesNeededForDef =
1049 VALUWriteVGPRReadlaneRead -
1050 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1051 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1052 }
1053 [[fallthrough]];
1054 case AMDGPU::V_WRITELANE_B32: {
1055 UseReg = AMDGPU::EXEC;
1056 int WaitStatesNeededForDef =
1057 VALUWriteEXECRWLane -
1058 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1059 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1060 break;
1061 }
1062 default:
1063 break;
1064 }
1065 }
1066
1067 // This checks for the hazard where VMEM instructions that store more than
1068 // 8 bytes can have their store data overwritten by the next instruction.
1069 if (!ST.has12DWordStoreHazard())
1070 return WaitStatesNeeded;
1071
1072 const MachineRegisterInfo &MRI = MF.getRegInfo();
1073
1074 for (const MachineOperand &Def : VALU->defs()) {
1075 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1076 }
1077
1078 return WaitStatesNeeded;
1079}
1080
1081int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1082 // This checks for hazards associated with inline asm statements.
1083 // Since inline asms can contain just about anything, we use this
1084 // to call/leverage other check*Hazard routines. Note that
1085 // this function doesn't attempt to address all possible inline asm
1086 // hazards (good luck), but is a collection of what has been
1087 // problematic thus far.
1088
1089 // see checkVALUHazards()
1092 return 0;
1093
1094 const MachineRegisterInfo &MRI = MF.getRegInfo();
1095 int WaitStatesNeeded = 0;
1096
1097 for (const MachineOperand &Op :
1098 llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1099 if (Op.isReg() && Op.isDef()) {
1100 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1101 continue;
1102
1103 if (ST.has12DWordStoreHazard()) {
1104 WaitStatesNeeded =
1105 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1106 }
1107 }
1108 }
1109
1110 if (ST.hasDstSelForwardingHazard()) {
1111 const int Shift16DefWaitstates = 1;
1112
1113 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1114 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1115 // Assume inline asm reads the dst
1116 if (Dst)
1117 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1118 IA->readsRegister(Dst->getReg(), &TRI);
1119
1120 if (ProducerMI.isInlineAsm()) {
1121 // If MI is inline asm, assume it has dst forwarding hazard
1122 for (auto &Def : ProducerMI.all_defs()) {
1123 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1124 IA->readsRegister(Def.getReg(), &TRI)) {
1125 return true;
1126 }
1127 }
1128 }
1129
1130 return false;
1131 };
1132
1133 int WaitStatesNeededForDef =
1134 Shift16DefWaitstates -
1135 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1136 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1137 }
1138
1139 return WaitStatesNeeded;
1140}
1141
1142int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1143 const SIInstrInfo *TII = ST.getInstrInfo();
1144 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1145 const MachineRegisterInfo &MRI = MF.getRegInfo();
1146
1147 const MachineOperand *LaneSelectOp =
1148 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1149
1150 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1151 return 0;
1152
1153 Register LaneSelectReg = LaneSelectOp->getReg();
1154 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1155
1156 const int RWLaneWaitStates = 4;
1157 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1158 RWLaneWaitStates);
1159 return RWLaneWaitStates - WaitStatesSince;
1160}
1161
1162int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1163 if (!ST.hasRFEHazards())
1164 return 0;
1165
1166 const SIInstrInfo *TII = ST.getInstrInfo();
1167
1168 const int RFEWaitStates = 1;
1169
1170 auto IsHazardFn = [TII](const MachineInstr &MI) {
1171 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1172 };
1173 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1174 return RFEWaitStates - WaitStatesNeeded;
1175}
1176
1177int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1178 const SIInstrInfo *TII = ST.getInstrInfo();
1179 const int ReadM0WaitStates = 1;
1180 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1181 return ReadM0WaitStates -
1182 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1183}
1184
1185void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1186 fixVMEMtoScalarWriteHazards(MI);
1187 fixVcmpxPermlaneHazards(MI);
1188 fixSMEMtoVectorWriteHazards(MI);
1189 fixVcmpxExecWARHazard(MI);
1190 fixLdsBranchVmemWARHazard(MI);
1191 if (ST.hasLdsDirect()) {
1192 fixLdsDirectVALUHazard(MI);
1193 fixLdsDirectVMEMHazard(MI);
1194 }
1195 fixVALUPartialForwardingHazard(MI);
1196 fixVALUTransUseHazard(MI);
1197 fixVALUTransCoexecutionHazards(MI);
1198 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1199 fixWMMACoexecutionHazards(MI);
1200 fixShift64HighRegBug(MI);
1201 fixVALUMaskWriteHazard(MI);
1202 fixRequiredExportPriority(MI);
1204 fixGetRegWaitIdle(MI);
1206 fixDsAtomicAsyncBarrierArriveB64(MI);
1208 fixScratchBaseForwardingHazard(MI);
1209 if (ST.setRegModeNeedsVNOPs())
1210 fixSetRegMode(MI);
1211}
1212
1213static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1214 const MachineInstr &MI) {
1215 return (TII.isVOPC(MI) ||
1216 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1217 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1218}
1219
1220bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1221 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1222 return false;
1223
1224 const SIInstrInfo *TII = ST.getInstrInfo();
1225 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1226 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1227 return isVCmpXWritesExec(*TII, *TRI, MI);
1228 };
1229
1230 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1231 unsigned Opc = MI.getOpcode();
1232 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1233 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1234 };
1235
1236 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1237 std::numeric_limits<int>::max())
1238 return false;
1239
1240 // V_NOP will be discarded by SQ.
1241 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1242 // which is always a VGPR and available.
1243 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1244 Register Reg = Src0->getReg();
1245 bool IsUndef = Src0->isUndef();
1246 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1247 TII->get(AMDGPU::V_MOV_B32_e32))
1248 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1249 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1250
1251 return true;
1252}
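// Illustrative sequence (assumption, not from the source) for this hazard:
//   v_cmpx_lt_f32 v0, v1            ; VOPC writing EXEC
//   v_permlane16_b32 v2, v3, s0, s1 ; permlane issued next
// After the fix, "v_mov_b32 v3, v3" is inserted between the two so that a
// non-permlane VALU separates the EXEC write from the permlane.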
1253
1254bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1255 if (!ST.hasVMEMtoScalarWriteHazard())
1256 return false;
1258
1259 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1260 return false;
1261
1262 if (MI->getNumDefs() == 0)
1263 return false;
1264
1265 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1266
1267 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1268 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
1269 return false;
1270
1271 for (const MachineOperand &Def : MI->defs()) {
1272 const MachineOperand *Op =
1273 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1274 if (!Op)
1275 continue;
1276 return true;
1277 }
1278 return false;
1279 };
1280
1281 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1282 return SIInstrInfo::isVALU(MI) ||
1283 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1284 !MI.getOperand(0).getImm()) ||
1285 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1286 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1287 };
1288
1289 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1290 std::numeric_limits<int>::max())
1291 return false;
1292
1293 const SIInstrInfo *TII = ST.getInstrInfo();
1294 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1295 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1296 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1297 return true;
1298}
1299
1300bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1301 if (!ST.hasSMEMtoVectorWriteHazard())
1302 return false;
1304
1305 if (!SIInstrInfo::isVALU(*MI))
1306 return false;
1307
1308 AMDGPU::OpName SDSTName;
1309 switch (MI->getOpcode()) {
1310 case AMDGPU::V_READLANE_B32:
1311 case AMDGPU::V_READFIRSTLANE_B32:
1312 SDSTName = AMDGPU::OpName::vdst;
1313 break;
1314 default:
1315 SDSTName = AMDGPU::OpName::sdst;
1316 break;
1317 }
1318
1319 const SIInstrInfo *TII = ST.getInstrInfo();
1320 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1321 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1322 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1323 if (!SDST) {
1324 for (const auto &MO : MI->implicit_operands()) {
1325 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1326 SDST = &MO;
1327 break;
1328 }
1329 }
1330 }
1331
1332 if (!SDST)
1333 return false;
1334
1335 const Register SDSTReg = SDST->getReg();
1336 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1337 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1338 };
1339
1340 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1341 if (TII->isSALU(MI)) {
1342 switch (MI.getOpcode()) {
1343 case AMDGPU::S_SETVSKIP:
1344 case AMDGPU::S_VERSION:
1345 case AMDGPU::S_WAITCNT_VSCNT:
1346 case AMDGPU::S_WAITCNT_VMCNT:
1347 case AMDGPU::S_WAITCNT_EXPCNT:
1348 // These instructions cannot mitigate the hazard.
1349 return false;
1350 case AMDGPU::S_WAITCNT_LGKMCNT:
1351 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1352 return (MI.getOperand(1).getImm() == 0) &&
1353 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1354 case AMDGPU::S_WAITCNT: {
1355 const int64_t Imm = MI.getOperand(0).getImm();
1356 const AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1357 // DsCnt corresponds to LGKMCnt here.
1358 return (Decoded.DsCnt == 0);
1359 }
1360 default:
1361 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1362 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1363 "unexpected wait count instruction");
1364 // SOPP instructions cannot mitigate the hazard.
1365 if (TII->isSOPP(MI))
1366 return false;
1367 // At this point the SALU can be assumed to mitigate the hazard
1368 // because either:
1369 // (a) it is independent of the at risk SMEM (breaking chain),
1370 // or
1371 // (b) it is dependent on the SMEM, in which case an appropriate
1372 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1373 // SMEM instruction.
1374 return true;
1375 }
1376 }
1377 return false;
1378 };
1379
1380 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1381 std::numeric_limits<int>::max())
1382 return false;
1383
1384 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1385 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1386 .addImm(0);
1387 return true;
1388}
1389
1390bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1391 if (!ST.hasVcmpxExecWARHazard())
1392 return false;
1394
1395 if (!SIInstrInfo::isVALU(*MI))
1396 return false;
1397
1398 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1399 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1400 return false;
1401
1402 auto IsHazardFn = [TRI](const MachineInstr &I) {
1403 if (SIInstrInfo::isVALU(I))
1404 return false;
1405 return I.readsRegister(AMDGPU::EXEC, TRI);
1406 };
1407
1408 const SIInstrInfo *TII = ST.getInstrInfo();
1409 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1410 if (SIInstrInfo::isVALU(MI)) {
1411 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1412 return true;
1413 for (auto MO : MI.implicit_operands())
1414 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1415 return true;
1416 }
1417 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1418 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1419 return true;
1420 return false;
1421 };
1422
1423 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1424 std::numeric_limits<int>::max())
1425 return false;
1426
1427 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1428 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1429 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1430 return true;
1431}
1432
1434 const GCNSubtarget &ST) {
1435 if (!ST.hasLdsBranchVmemWARHazard())
1436 return false;
1437
1438 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1439 // instructions need to appear in the same function.
1440 bool HasLds = false;
1441 bool HasVmem = false;
1442 for (auto &MBB : MF) {
1443 for (auto &MI : MBB) {
1444 HasLds |= SIInstrInfo::isDS(MI);
1445 HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
1446 SIInstrInfo::isSegmentSpecificFLAT(MI);
1447 if (HasLds && HasVmem)
1448 return true;
1449 }
1450 }
1451 return false;
1452}
1453
1454static bool isStoreCountWaitZero(const MachineInstr &I) {
1455 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1456 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1457 !I.getOperand(1).getImm();
1458}
1459
1460bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1461 if (!RunLdsBranchVmemWARHazardFixup)
1462 return false;
1463
1466
1467 auto IsHazardInst = [](const MachineInstr &MI) {
1468 if (SIInstrInfo::isDS(MI))
1469 return 1;
1470 if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
1471 SIInstrInfo::isSegmentSpecificFLAT(MI))
1472 return 2;
1473 return 0;
1474 };
1475
1476 auto InstType = IsHazardInst(*MI);
1477 if (!InstType)
1478 return false;
1479
1480 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1481 return IsHazardInst(I) || isStoreCountWaitZero(I);
1482 };
1483
1484 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1485 if (!I.isBranch())
1486 return false;
1487
1488 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1489 auto InstType2 = IsHazardInst(I);
1490 return InstType2 && InstType != InstType2;
1491 };
1492
1493 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1494 auto InstType2 = IsHazardInst(I);
1495 if (InstType == InstType2)
1496 return true;
1497
1498 return isStoreCountWaitZero(I);
1499 };
1500
1501 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1502 std::numeric_limits<int>::max();
1503 };
1504
1505 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1506 std::numeric_limits<int>::max())
1507 return false;
1508
1509 const SIInstrInfo *TII = ST.getInstrInfo();
1510 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1511 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1512 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1513 .addImm(0);
1514
1515 return true;
1516}
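// Illustrative shape of the hazard (assumption, not from the source):
//   ds_read_b32 v0, v1                          ; LDS access
//   s_cbranch_scc1 BB1                          ; branch
//   buffer_store_dword v2, v3, s[0:3], 0 offen  ; VMEM access on the far side
// The inserted "s_waitcnt_vscnt null, 0x0" separates the LDS and VMEM
// accesses across the branch.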
1517
1518bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1519 if (!SIInstrInfo::isLDSDIR(*MI))
1520 return false;
1521
1522 const int NoHazardWaitStates = 15;
1523 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1524 const Register VDSTReg = VDST->getReg();
1525
1526 bool VisitedTrans = false;
1527 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1528 if (!SIInstrInfo::isVALU(I))
1529 return false;
1530 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1531 // Cover both WAR and WAW
1532 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1533 };
1534 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1535 if (WaitStates >= NoHazardWaitStates)
1536 return true;
1537 // Instructions which cause va_vdst==0 expire hazard
1538 return SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
1539 SIInstrInfo::isEXP(I);
1540 };
1541 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1542 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1543 };
1544
1545 DenseSet<const MachineBasicBlock *> Visited;
1546 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1547 std::next(MI->getReverseIterator()), 0,
1548 IsExpiredFn, Visited, GetWaitStatesFn);
1549
1550 // Transcendentals can execute in parallel to other VALUs.
1551 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1552 if (VisitedTrans)
1553 Count = 0;
1554
1555 MachineOperand *WaitVdstOp =
1556 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1557 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1558
1559 return true;
1560}
1561
1562bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1563 if (!SIInstrInfo::isLDSDIR(*MI))
1564 return false;
1565
1566 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1567 const Register VDSTReg = VDST->getReg();
1568
1569 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1570 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
1571 return false;
1572 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1573 };
1574 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1575 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1576 // according to the type of VMEM instruction.
1577 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1578 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1579 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1580 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1581 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1582 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1583 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1584 };
1585
1586 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1587 std::numeric_limits<int>::max())
1588 return false;
1589
1590 if (LdsdirCanWait) {
1591 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1592 } else {
1593 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1594 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1595 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1596 }
1597
1598 return true;
1599}
1600
1601bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1602 if (!ST.hasVALUPartialForwardingHazard())
1603 return false;
1605
1606 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1607 return false;
1608
1610
1611 for (const MachineOperand &Use : MI->explicit_uses()) {
1612 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1613 SrcVGPRs.insert(Use.getReg());
1614 }
1615
1616 // Only applies with >= 2 unique VGPR sources
1617 if (SrcVGPRs.size() <= 1)
1618 return false;
1619
1620 // Look for the following pattern:
1621 // Va <- VALU [PreExecPos]
1622 // intv1
1623 // Exec <- SALU [ExecPos]
1624 // intv2
1625 // Vb <- VALU [PostExecPos]
1626 // intv3
1627 // MI Va, Vb (WaitState = 0)
1628 //
1629 // Where:
1630 // intv1 + intv2 <= 2 VALUs
1631 // intv3 <= 4 VALUs
1632 //
1633 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1634
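  // Illustrative sequence (assumption, not from the source) matching the
  // pattern above:
  //   v_mov_b32    v0, 1           ; Va written before the exec change
  //   s_mov_b64    exec, s[2:3]    ; SALU exec write
  //   v_mov_b32    v1, 2           ; Vb written after the exec change
  //   v_add_nc_u32 v2, v0, v1      ; MI reads both Va and Vb -> hazard
  // The fix below inserts an s_waitcnt_depctr with va_vdst==0 before MI.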
1635 const int Intv1plus2MaxVALUs = 2;
1636 const int Intv3MaxVALUs = 4;
1637 const int IntvMaxVALUs = 6;
1638 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1639
1640 struct StateType {
1642 int ExecPos = std::numeric_limits<int>::max();
1643 int VALUs = 0;
1644 };
1645
1646 StateType State;
1647
1648 // This overloads expiry testing with all the hazard detection
1649 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1650 // Too many VALU states have passed
1651 if (State.VALUs > NoHazardVALUWaitStates)
1652 return HazardExpired;
1653
1654 // Instructions which cause va_vdst==0 expire hazard
1655 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
1656 SIInstrInfo::isEXP(I) ||
1657 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1658 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1659 return HazardExpired;
1660
1661 // Track registers writes
1662 bool Changed = false;
1663 if (SIInstrInfo::isVALU(I)) {
1664 for (Register Src : SrcVGPRs) {
1665 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1666 State.DefPos[Src] = State.VALUs;
1667 Changed = true;
1668 }
1669 }
1670 } else if (SIInstrInfo::isSALU(I)) {
1671 if (State.ExecPos == std::numeric_limits<int>::max()) {
1672 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1673 State.ExecPos = State.VALUs;
1674 Changed = true;
1675 }
1676 }
1677 }
1678
1679 // Early expiration: too many VALUs in intv3
1680 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1681 return HazardExpired;
1682
1683 // Only evaluate state if something changed
1684 if (!Changed)
1685 return NoHazardFound;
1686
1687 // Determine positions of VALUs pre/post exec change
1688 if (State.ExecPos == std::numeric_limits<int>::max())
1689 return NoHazardFound;
1690
1691 int PreExecPos = std::numeric_limits<int>::max();
1692 int PostExecPos = std::numeric_limits<int>::max();
1693
1694 for (auto Entry : State.DefPos) {
1695 int DefVALUs = Entry.second;
1696 if (DefVALUs != std::numeric_limits<int>::max()) {
1697 if (DefVALUs >= State.ExecPos)
1698 PreExecPos = std::min(PreExecPos, DefVALUs);
1699 else
1700 PostExecPos = std::min(PostExecPos, DefVALUs);
1701 }
1702 }
1703
1704 // Need a VALUs post exec change
1705 if (PostExecPos == std::numeric_limits<int>::max())
1706 return NoHazardFound;
1707
1708 // Too many VALUs in intv3?
1709 int Intv3VALUs = PostExecPos;
1710 if (Intv3VALUs > Intv3MaxVALUs)
1711 return HazardExpired;
1712
1713 // Too many VALUs in intv2?
1714 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1715 if (Intv2VALUs > Intv1plus2MaxVALUs)
1716 return HazardExpired;
1717
1718 // Need a VALUs pre exec change
1719 if (PreExecPos == std::numeric_limits<int>::max())
1720 return NoHazardFound;
1721
1722 // Too many VALUs in intv1?
1723 int Intv1VALUs = PreExecPos - State.ExecPos;
1724 if (Intv1VALUs > Intv1plus2MaxVALUs)
1725 return HazardExpired;
1726
1727 // Too many VALUs in intv1 + intv2
1728 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1729 return HazardExpired;
1730
1731 return HazardFound;
1732 };
1733 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1734 if (SIInstrInfo::isVALU(MI))
1735 State.VALUs += 1;
1736 };
1737
1738 DenseSet<const MachineBasicBlock *> Visited;
1739 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1740 std::next(MI->getReverseIterator()), Visited))
1741 return false;
1742
1743 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1744 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1745 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1746
1747 return true;
1748}
1749
1750bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1751 if (!ST.hasVALUTransUseHazard())
1752 return false;
1754
1755 if (!SIInstrInfo::isVALU(*MI))
1756 return false;
1757
1758 SmallSet<Register, 4> SrcVGPRs;
1759
1760 for (const MachineOperand &Use : MI->explicit_uses()) {
1761 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1762 SrcVGPRs.insert(Use.getReg());
1763 }
1764
1765 // Look for the following pattern:
1766 // Va <- TRANS VALU
1767 // intv
1768 // MI Va (WaitState = 0)
1769 //
1770 // Where:
1771 // intv <= 5 VALUs / 1 TRANS
1772 //
1773 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1774
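  // Illustrative sequence (assumption, not from the source) matching the
  // pattern above:
  //   v_rcp_f32 v0, v1         ; TRANS writes Va
  //   v_add_f32 v2, v0, v3     ; MI reads Va within the interval -> hazard
  // The fix below inserts an s_waitcnt_depctr with va_vdst==0 before MI.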
1775 const int IntvMaxVALUs = 5;
1776 const int IntvMaxTRANS = 1;
1777
1778 struct StateType {
1779 int VALUs = 0;
1780 int TRANS = 0;
1781 };
1782
1783 StateType State;
1784
1785 // This overloads expiry testing with all the hazard detection
1786 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1787 // Too many VALU states have passed
1788 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1789 return HazardExpired;
1790
1791 // Instructions which cause va_vdst==0 expire hazard
1792 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
1793 SIInstrInfo::isEXP(I) ||
1794 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1795 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1796 return HazardExpired;
1797
1798 // Track registers writes
1799 if (SIInstrInfo::isTRANS(I)) {
1800 for (Register Src : SrcVGPRs) {
1801 if (I.modifiesRegister(Src, &TRI)) {
1802 return HazardFound;
1803 }
1804 }
1805 }
1806
1807 return NoHazardFound;
1808 };
1809 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1810 if (SIInstrInfo::isVALU(MI))
1811 State.VALUs += 1;
1812 if (SIInstrInfo::isTRANS(MI))
1813 State.TRANS += 1;
1814 };
1815
1816 DenseSet<const MachineBasicBlock *> Visited;
1817 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1818 std::next(MI->getReverseIterator()), Visited))
1819 return false;
1820
1821 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1822 // avoided.
1823 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1824 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1825 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1826
1827 return true;
1828}
1829
1830bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1831 if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled.
1832 !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
1833 return false;
1834
1835 const SIInstrInfo *TII = ST.getInstrInfo();
1836 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1837
1838 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1839 if (!SIInstrInfo::isTRANS(I))
1840 return false;
1841
1842 // RAW: Trans(I) writes, VALU(MI) reads.
1843 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1844 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1845 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1846 return true;
1847 }
1848
1849 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1850 if (!ValuDst || !ValuDst->isReg())
1851 return false;
1852
1853 // WAR: Trans(I) reads, VALU(MI) writes.
1854 Register ValuDef = ValuDst->getReg();
1855 for (const MachineOperand &TransUse : I.explicit_uses()) {
1856 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1857 return true;
1858 }
1859
1860 return false;
1861 };
1862
1863 auto IsExpiredFn = [](const MachineInstr &I, int) {
1864 return SIInstrInfo::isVALU(I);
1865 };
1866
1867 const int HasVALU = std::numeric_limits<int>::max();
1868 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
1869 return false;
1870
1871 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1872 return true;
1873}
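// Illustrative only (hypothetical GFX1250 sequence, not from this file): with
// TRANS/VALU co-execution,
//   v_rcp_f32 v1, v0
//   v_mul_f32 v2, v1, v3
// is a RAW hazard on v1 (a VALU overwriting a TRANS source is the WAR case);
// the fix above breaks co-execution by placing a V_NOP between the two.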
1874
1875bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1876 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1877 return false;
1878
1879 const SIInstrInfo *TII = ST.getInstrInfo();
1880 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1881
1882 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1883 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1884 return false;
1885
1886 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1887 // with the dest(matrix D) of the previous wmma.
1888 const Register CurSrc0Reg =
1889 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1890 const Register CurSrc1Reg =
1891 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1892
1893 const Register PrevDstReg =
1894 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1895
1896 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1897 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1898 return true;
1899 }
1900
1901 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1902 // but Index can't overlap with PrevDstReg.
1903 if (AMDGPU::isGFX12Plus(ST)) {
1904 if (SIInstrInfo::isSWMMAC(*MI)) {
1905 const Register CurIndex =
1906 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1907 if (TRI->regsOverlap(PrevDstReg, CurIndex))
1908 return true;
1909 }
1910 return false;
1911 }
1912
1913 return false;
1914 };
1915
1916 auto IsExpiredFn = [](const MachineInstr &I, int) {
1917 return SIInstrInfo::isVALU(I);
1918 };
1919
1920 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1921 std::numeric_limits<int>::max())
1922 return false;
1923
1924 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1925
1926 return true;
1927}
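// Illustrative only (hypothetical sequence, not from this file): back-to-back
// WMMAs such as
//   v_wmma_f32_16x16x16_f16 v[0:7], ..., ...            ; writes matrix D
//   v_wmma_f32_16x16x16_f16 v[8:15], v[0:7], ..., ...    ; reads D as matrix A
// hit the hazard above, so a V_NOP (or any other VALU) must separate them.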
1928
1929static bool isCoexecutableVALUInst(const MachineInstr &MI) {
1930 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) &&
1931 !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
1932}
1933
1934static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
1935 const SIInstrInfo *TII, unsigned Latency,
1936 unsigned Category) {
1937 assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
1938 "Handle me if the xdl wmma instruction latency changes");
1939
1940 switch (Category) {
1941 case 0: // Dense WMMA Instructions:
1942 // WMMA_*F16, WMMA_*BF16
1943 // WMMA_*FP8FP8
1944 // WMMA_*FP8BF8
1945 // WMMA_*BF8FP8
1946 // WMMA_*BF8BF8
1947 // WMMA_*F8F6F4 if SRCA & SRCB != F8
1948 return Latency == 8 && SIInstrInfo::isWMMA(MI);
1949
1950 case 1: // Dense WMMA Instructions:
1951 // WMMA_IU8
1952 // WMMA_IU4
1953 // WMMA_*F8F6F4 if SRCA OR SRCB == F8
1954 return Latency == 16 && SIInstrInfo::isWMMA(MI);
1955
1956 case 2: // Dense SWMMAC Instructions
1957 // SWMMAC_*F16, SWMMAC_*BF16,
1958 // SWMMAC_*FP8FP8
1959 // SWMMAC_*BF8FP8
1960 // SWMMAC_*FP8BF8
1961 // SWMMAC_*BF8BF8
1962 return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
1963
1964 case 3: // Sparse WMMA Instructions:
1965 // SWMMAC_IU8
1966 // SWMMAC_IU4
1967 return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
1968 default:
1969 break;
1970 } // end switch.
1971
1972 return false;
1973}
1974
1975bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
1976 if (!AMDGPU::isGFX1250(ST))
1977 return false;
1978
1979 const SIInstrInfo *TII = ST.getInstrInfo();
1980 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
1981 return false;
1982
1983 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1984
1985 // WaitStates here is the number of V_NOPs or unrelated VALU instructions that
1986 // must be in between the first WMMA and the second instruction to cover the
1987 // hazard (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the
1988 // second is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data
1989 // hazards" for the numbers, which depend on the category of the first WMMA.
1990 const int WMMAWaitStates[] = {5, 9, 3, 5};
1991 const int VALUWaitStates[] = {4, 8, 2, 4};
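// Worked example (illustrative): if the first WMMA falls into category 1 (the
// 16-pass dense WMMAs), a dependent WMMA needs 9 intervening VALU slots and a
// dependent VALU needs 8; the loops below compute the shortfall as
// Limit - ::getWaitStatesSince(...) and fill it with V_NOPs.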
1992 unsigned Category = 0;
1993
1994 auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
1995 if (!TII->isXDLWMMA(I))
1996 return false;
1997
1998 unsigned Latency = TSchedModel.computeInstrLatency(&I);
1999 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2000 return false;
2001
2002 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2003 Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2004 Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2005
2006 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2007 if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
2008 return true;
2009
2010 if (SIInstrInfo::isSWMMAC(*MI)) {
2011 Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2012 if (TRI->regsOverlap(D0, Idx1))
2013 return true;
2014 }
2015
2016 return false;
2017 };
2018
2019 auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2020 if (!TII->isXDLWMMA(I))
2021 return false;
2022
2023 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2024 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2025 return false;
2026
2027 // WMMA writes, VALU reads.
2028 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2029 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
2030 if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
2031 return true;
2032 }
2033
2034 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
2035 if (!ValuDst || !ValuDst->isReg())
2036 return false;
2037 Register D1 = ValuDst->getReg();
2038
2039 // WMMA writes, VALU writes.
2040 if (TRI->regsOverlap(D0, D1))
2041 return true;
2042
2043 // WMMA reads, VALU writes.
2044 Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
2045 Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
2046 if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
2047 return true;
2048
2049 if (SIInstrInfo::isSWMMAC(I)) {
2050 Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
2051 if (TRI->regsOverlap(D1, Idx0))
2052 return true;
2053 }
2054
2055 return false;
2056 };
2057
2058 int Limit = 0;
2059 auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
2060 return WaitStates >= Limit;
2061 };
2062
2063 auto GetWaitStatesFn = [](const MachineInstr &I) {
2064 return SIInstrInfo::isVALU(I) ? 1 : 0;
2065 };
2066
2067 int WaitStatesNeeded = -1;
2068 if (TII->isXDLWMMA(*MI)) {
2069 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2070 Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
2072 // '::getWaitStatesSince' returns the number of VALUs in between if hazard
2073 // exists, and INT_MAX if there is no hazard. As a result, a negative
2074 // WaitStatesNeeded here means no hazard, and we will continue to search
2075 // for other categories.
2076 WaitStatesNeeded =
2077 Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
2078 std::next(MI->getReverseIterator()), 0,
2079 IsExpiredFn, Visited, GetWaitStatesFn);
2080 }
2081 } else { // Must be a co-executable VALU.
2082 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2083 Limit = VALUWaitStates[Category]; // for IsExpiredFn.
2085 // '::getWaitStatesSince' returns the number of VALUs in between if hazard
2086 // exists, and INT_MAX if there is no hazard. As a result, a negative
2087 // WaitStatesNeeded here means no hazard, and we will continue to search
2088 // for other categories.
2089 WaitStatesNeeded =
2090 Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
2091 std::next(MI->getReverseIterator()), 0,
2092 IsExpiredFn, Visited, GetWaitStatesFn);
2093 }
2094 }
2095
2096 // WaitStatesNeeded is now the number of V_NOPs we need to insert; a negative
2097 // value means none are needed.
2098 for (int i = 0; i < WaitStatesNeeded; i++)
2099 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2100 TII->get(AMDGPU::V_NOP_e32));
2101
2102 return true;
2103}
2104
2105bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2106 if (!ST.hasShift64HighRegBug())
2107 return false;
2109
2110 switch (MI->getOpcode()) {
2111 default:
2112 return false;
2113 case AMDGPU::V_LSHLREV_B64_e64:
2114 case AMDGPU::V_LSHRREV_B64_e64:
2115 case AMDGPU::V_ASHRREV_I64_e64:
2116 break;
2117 }
2118
2119 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
2120 if (!Amt->isReg())
2121 return false;
2122
2123 Register AmtReg = Amt->getReg();
2124 const MachineRegisterInfo &MRI = MF.getRegInfo();
2125 // Check if this is the last VGPR in the allocation block.
2126 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2127 return false;
2128
2129 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
2130 return false;
2131
2132 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
2133 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
2134 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
2135 bool Overlapped = OverlappedSrc || OverlappedDst;
2136
2137 assert(!OverlappedDst || !OverlappedSrc ||
2138 Src1->getReg() == MI->getOperand(0).getReg());
2140 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2141
2142 Register NewReg;
2143 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2144 : AMDGPU::VGPR_32RegClass) {
2145 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
2146 NewReg = Reg;
2147 break;
2148 }
2149 }
2150
2151 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2152 : NewReg;
2153 Register NewAmtLo;
2154
2155 if (Overlapped)
2156 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2157
2158 DebugLoc DL = MI->getDebugLoc();
2159 MachineBasicBlock *MBB = MI->getParent();
2160 // Insert a full waitcnt because the found register might have a pending wait.
2161 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
2162 .addImm(0);
2163
2164 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2165 if (Overlapped)
2166 runOnInstruction(
2167 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
2168 .addDef(AmtReg - 1)
2169 .addReg(AmtReg - 1, RegState::Undef)
2170 .addReg(NewAmtLo, RegState::Undef));
2171 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2172 .addDef(AmtReg)
2173 .addReg(AmtReg, RegState::Undef)
2174 .addReg(NewAmt, RegState::Undef));
2175
2176 // Instructions emitted after the current instruction will be processed by the
2177 // parent loop of the hazard recognizer in a natural way.
2178 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2179 AmtReg)
2180 .addDef(NewAmt)
2181 .addReg(NewAmt)
2182 .addReg(AmtReg);
2183 if (Overlapped)
2184 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2185 AmtReg - 1)
2186 .addDef(NewAmtLo)
2187 .addReg(NewAmtLo)
2188 .addReg(AmtReg - 1);
2189
2190 // Re-running the hazard recognizer on the modified instruction is not needed:
2191 // the inserted V_SWAP_B32 has already both read and written the new registers,
2192 // so hazards related to these registers have already been handled.
2193 Amt->setReg(NewAmt);
2194 Amt->setIsKill(false);
2195 // We do not update liveness, so verifier may see it as undef.
2196 Amt->setIsUndef();
2197 if (OverlappedDst)
2198 MI->getOperand(0).setReg(NewReg);
2199 if (OverlappedSrc) {
2200 Src1->setReg(NewReg);
2201 Src1->setIsKill(false);
2202 Src1->setIsUndef();
2203 }
2204
2205 return true;
2206}
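// Illustrative only (hypothetical registers, simplified): for
//   v_lshlrev_b64 v[0:1], v15, v[0:1]
// the shift amount v15 is the last VGPR of its 8-register block
// ((15 - 0) & 7 == 7) and v16 is unused, so the workaround above emits
// "s_waitcnt 0", swaps v15 into a free VGPR with v_swap_b32, performs the
// shift on the swapped register, and swaps back after the shift.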
2207
2208int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
2209 int NSAtoVMEMWaitStates = 1;
2210
2211 if (!ST.hasNSAtoVMEMBug())
2212 return 0;
2213
2215 return 0;
2216
2217 const SIInstrInfo *TII = ST.getInstrInfo();
2218 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2219 if (!Offset || (Offset->getImm() & 6) == 0)
2220 return 0;
2221
2222 auto IsHazardFn = [TII](const MachineInstr &I) {
2223 if (!SIInstrInfo::isMIMG(I))
2224 return false;
2225 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2226 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2227 TII->getInstSizeInBytes(I) >= 16;
2228 };
2229
2230 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2231}
2232
2233int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2234 int FPAtomicToDenormModeWaitStates = 3;
2235
2237 return 0;
2239
2240 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2241 return 0;
2242
2243 auto IsHazardFn = [](const MachineInstr &I) {
2244 if (!SIInstrInfo::isVMEM(I))
2245 return false;
2246 return SIInstrInfo::isFPAtomic(I);
2247 };
2248
2249 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2250 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2251 return true;
2252
2253 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2254 };
2255
2256 return FPAtomicToDenormModeWaitStates -
2257 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2258}
2259
2260int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2262
2263 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2264}
2265
2266int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2267 // Early exit if no padding is requested.
2268 if (MFMAPaddingRatio == 0)
2269 return 0;
2270
2271 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2272 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2273 return 0;
2274
2275 int NeighborMFMALatency = 0;
2276 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2277 this](const MachineInstr &MI) {
2278 if (!SIInstrInfo::isMFMA(MI))
2279 return false;
2280
2281 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2282 return true;
2283 };
2284
2285 const int MaxMFMAPipelineWaitStates = 16;
2286 int WaitStatesSinceNeighborMFMA =
2287 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2288
2289 int NeighborMFMAPaddingNeeded =
2290 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2291 WaitStatesSinceNeighborMFMA;
2292
2293 return std::max(0, NeighborMFMAPaddingNeeded);
2294}
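// Worked example (illustrative): with amdgpu-mfma-padding-ratio=50 and a
// neighboring MFMA of latency 16, the target spacing is 16 * 50 / 100 = 8 wait
// states; if only 3 wait states have elapsed since that MFMA, 5 more wait
// states of s_nop padding are requested (negative results clamp to 0).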
2295
2296int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2297 int WaitStatesNeeded = 0;
2298 unsigned Opc = MI->getOpcode();
2299
2300 auto IsVALUFn = [](const MachineInstr &MI) {
2301 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2302 };
2303
2304 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2305 const int LegacyVALUWritesVGPRWaitStates = 2;
2306 const int VALUWritesExecWaitStates = 4;
2307 const int MaxWaitStates = 4;
2308
2309 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2310 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2311 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2312
2313 if (WaitStatesNeeded < MaxWaitStates) {
2314 for (const MachineOperand &Use : MI->explicit_uses()) {
2315 const int MaxWaitStates = 2;
2316
2317 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2318 continue;
2319
2320 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2321 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2322 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2323
2324 if (WaitStatesNeeded == MaxWaitStates)
2325 break;
2326 }
2327 }
2328 }
2329
2330 for (const MachineOperand &Op : MI->explicit_operands()) {
2331 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2332 continue;
2333
2334 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2335 continue;
2336
2337 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2338 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2339 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2340 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2341 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2342 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2343 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2344 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2345 const int MaxWaitStates = 18;
2346 Register Reg = Op.getReg();
2347 unsigned HazardDefLatency = 0;
2348
2349 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2350 this](const MachineInstr &MI) {
2351 if (!SIInstrInfo::isMFMA(MI))
2352 return false;
2353 Register DstReg = MI.getOperand(0).getReg();
2354 if (DstReg == Reg)
2355 return false;
2356 HazardDefLatency =
2357 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2358 return TRI.regsOverlap(DstReg, Reg);
2359 };
2360
2361 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2362 MaxWaitStates);
2363 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2364 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2365 int OpNo = Op.getOperandNo();
2366 if (OpNo == SrcCIdx) {
2367 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2368 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2369 switch (HazardDefLatency) {
2370 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2371 break;
2372 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2373 break;
2374 case 16: [[fallthrough]];
2375 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2376 break;
2377 }
2378 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2379 switch (HazardDefLatency) {
2380 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2381 break;
2382 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2383 break;
2384 case 16: [[fallthrough]];
2385 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2386 break;
2387 }
2388 }
2389
2390 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2391 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2392
2393 if (WaitStatesNeeded == MaxWaitStates)
2394 return WaitStatesNeeded; // Early exit.
2395
2396 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2397 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2398 return false;
2399 Register DstReg = MI.getOperand(0).getReg();
2400 return TRI.regsOverlap(Reg, DstReg);
2401 };
2402
2403 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2404 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2405 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2406 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2407 if (OpNo == SrcCIdx)
2408 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2409 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2410 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2411
2412 WaitStatesNeededForUse = NeedWaitStates -
2413 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2414 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2415
2416 if (WaitStatesNeeded == MaxWaitStates)
2417 return WaitStatesNeeded; // Early exit.
2418 }
2419
2420 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2421 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2422 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2423 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2424 const int MaxWaitStates = 13;
2425 Register DstReg = MI->getOperand(0).getReg();
2426 unsigned HazardDefLatency = 0;
2427
2428 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2429 this](const MachineInstr &MI) {
2430 if (!SIInstrInfo::isMFMA(MI))
2431 return false;
2432 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2433 HazardDefLatency =
2434 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2435 return TRI.regsOverlap(Reg, DstReg);
2436 };
2437
2438 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2439 int NeedWaitStates;
2440 switch (HazardDefLatency) {
2441 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2442 break;
2443 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2444 break;
2445 case 16: [[fallthrough]];
2446 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2447 break;
2448 }
2449
2450 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2451 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2452 }
2453
2454 // Pad neighboring MFMA with noops for better inter-wave performance.
2455 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2456
2457 return WaitStatesNeeded;
2458}
2459
2460static int
2462 bool IsGFX950) {
2463 // xdl def cycles | gfx940 | gfx950
2464 // 2 pass | 3 4
2465 // 4 pass | 5 6
2466 // 8 pass | 9 10
2467 // 16 pass | 17 18
2468 return NumPasses + 1 + IsGFX950;
2469}
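// Worked example (illustrative): for an 8-pass xdl def on gfx950 the formula
// above gives 8 + 1 + 1 = 10 wait states, matching the "8 pass | 9 10" row of
// the table.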
2470
2471static int
2473 bool IsGFX950) {
2474 // xdl def cycles | gfx940 | gfx950
2475 // 2 pass | 3 3
2476 // 4 pass | 5 6
2477 // 8 pass | 9 10
2478 // 16 pass | 17 18
2479 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2480}
2481
2482static int
2484 // 2 pass -> 2
2485 // 4 pass -> 4
2486 // 8 pass -> 8
2487 // 16 pass -> 16
2488 return NumPasses;
2489}
2490
2491static int
2493 // 2 pass -> 4
2494 // 4 pass -> 6
2495 // 8 pass -> 10
2496 // 16 pass -> 18
2497 return NumPasses + 2;
2498}
2499
2501 bool IsGFX950) {
2502 // xdl def cycles | gfx942 | gfx950
2503 // 2 pass | 5 5
2504 // 4 pass | 7 8
2505 // 8 pass | 11 12
2506 // 16 pass | 19 20
2507 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2508}
2509
2510int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2511 int WaitStatesNeeded = 0;
2512 unsigned Opc = MI->getOpcode();
2513
2514 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2515 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2516 };
2517
2518 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2519 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2520 !SIInstrInfo::isDOT(MI);
2521 };
2522
2523 if (!SIInstrInfo::isMFMA(*MI))
2524 return WaitStatesNeeded;
2525
2526 const int VALUWritesExecWaitStates = 4;
2527 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2528 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2529 VALUWritesExecWaitStates);
2530 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2531
2532 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2533
2534 // This loop handles both DGEMM and S/HGEMM as the second instruction.
2535 for (const MachineOperand &Use : MI->explicit_uses()) {
2536 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2537 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2538 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2539 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2540 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2541 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2542 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2543 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2544 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2545 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2546 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2547 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2548 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2549 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2550 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2551 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2552 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2553 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2554 const int MaxWaitStates = 19;
2555
2556 if (!Use.isReg())
2557 continue;
2558 Register Reg = Use.getReg();
2559 bool FullReg;
2560 const MachineInstr *MI1;
2561
2562 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2563 this](const MachineInstr &MI) {
2564 if (!SIInstrInfo::isMFMA(MI))
2565 return false;
2566 Register DstReg = MI.getOperand(0).getReg();
2567 FullReg = (DstReg == Reg);
2568 MI1 = &MI;
2569 return TRI.regsOverlap(DstReg, Reg);
2570 };
2571
2572 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2573 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2574 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2575
2576 int NumWaitStates =
2577 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2578 if (NumWaitStates == std::numeric_limits<int>::max())
2579 continue;
2580
2581 int OpNo = Use.getOperandNo();
2582 unsigned Opc1 = MI1->getOpcode();
2583 int NeedWaitStates = 0;
2584 if (OpNo == SrcCIdx) {
2585 if (!SIInstrInfo::isDGEMM(Opc) &&
2586 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2587 NeedWaitStates = 0;
2588 } else if (FullReg) {
2589 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2590 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2591 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2592 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2593 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2594 else if (ST.hasGFX940Insts() &&
2595 TSchedModel.computeInstrLatency(MI1) == 2)
2596 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2597 } else {
2598 switch (Opc1) {
2599 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2600 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2601 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2602 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2603 if (!TII.isXDL(*MI))
2604 NeedWaitStates =
2605 ST.hasGFX950Insts()
2606 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2607 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2608 break;
2609 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2610 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2611 if (!TII.isXDL(*MI))
2612 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2613 break;
2614 default:
2615 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2616 if (ST.hasGFX940Insts()) {
2617 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2618 break;
2619
2620 NeedWaitStates =
2621 TII.isXDL(*MI1)
2622 ? (TII.isXDL(*MI)
2624 NumPasses, ST.hasGFX950Insts())
2626 NumPasses, ST.hasGFX950Insts()))
2628 NumPasses);
2629 break;
2630 }
2631
2632 switch (NumPasses) {
2633 case 2:
2634 NeedWaitStates =
2636 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2637 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2638 break;
2639 case 8:
2640 NeedWaitStates =
2642 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2643 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2644 break;
2645 case 16:
2646 NeedWaitStates =
2648 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2649 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2650 break;
2651 default:
2652 llvm_unreachable("unexpected number of passes");
2653 }
2654 }
2655 }
2656 } else {
2657 switch (Opc1) {
2658 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2659 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2660 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2661 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2662 NeedWaitStates =
2663 ST.hasGFX950Insts()
2664 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2665 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2666 break;
2667 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2668 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2669 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2670 break;
2671 default:
2672 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2673
2674 if (ST.hasGFX940Insts()) {
2675 NeedWaitStates =
2676 TII.isXDL(*MI1)
2678 NumPasses, ST.hasGFX950Insts())
2680 NumPasses);
2681 break;
2682 }
2683
2684 switch (NumPasses) {
2685 case 2:
2686 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2687 break;
2688 case 4:
2689 llvm_unreachable("unexpected number of passes for mfma");
2690 case 8:
2691 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2692 break;
2693 case 16:
2694 default:
2695 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2696 }
2697 }
2698 }
2699 if (WaitStatesNeeded >= NeedWaitStates)
2700 continue;
2701
2702 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2703 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2704
2705 if (WaitStatesNeeded == MaxWaitStates)
2706 break;
2707 }
2708
2709 // Pad neighboring MFMA with noops for better inter-wave performance.
2710 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2711
2712 return WaitStatesNeeded;
2713}
2714
2715int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2716 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2717 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2718 return 0;
2719
2720 int WaitStatesNeeded = 0;
2721
2722 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2723 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2724 };
2725
2726 for (const MachineOperand &Op : MI->explicit_uses()) {
2727 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2728 continue;
2729
2730 Register Reg = Op.getReg();
2731
2732 const int AccVgprReadLdStWaitStates = 2;
2733 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2734 const int MaxWaitStates = 2;
2735
2736 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2737 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2738 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2739
2740 if (WaitStatesNeeded == MaxWaitStates)
2741 return WaitStatesNeeded; // Early exit.
2742
2743 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2744 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2745 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2746 return false;
2747 auto IsVALUFn = [](const MachineInstr &MI) {
2748 return SIInstrInfo::isVALU(MI);
2749 };
2750 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2751 std::numeric_limits<int>::max();
2752 };
2753
2754 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2755 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2756 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2757 }
2758
2759 return WaitStatesNeeded;
2760}
2761
2762int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2764 "this is a different vcmpx+permlane hazard");
2765 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2766 const SIInstrInfo *TII = ST.getInstrInfo();
2767
2768 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2769 return isVCmpXWritesExec(*TII, *TRI, MI);
2770 };
2771
2772 auto IsVALUFn = [](const MachineInstr &MI) {
2773 return SIInstrInfo::isVALU(MI);
2774 };
2775
2776 const int VCmpXWritesExecWaitStates = 4;
2777 const int VALUWritesVDstWaitStates = 2;
2778 int WaitStatesNeeded = 0;
2779
2780 for (const MachineOperand &Op : MI->explicit_uses()) {
2781 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2782 continue;
2783 Register Reg = Op.getReg();
2784
2785 int WaitStatesSinceDef =
2786 VALUWritesVDstWaitStates -
2787 getWaitStatesSinceDef(Reg, IsVALUFn,
2788 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2789 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2790 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2791 break;
2792 }
2793
2794 int VCmpXHazardWaits =
2795 VCmpXWritesExecWaitStates -
2796 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2797
2798 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2799 return WaitStatesNeeded;
2800}
2801
2803 // 2 pass -> 4
2804 // 4 pass -> 6
2805 // 8 pass -> 10
2806 // 16 pass -> 18
2807 return NumPasses + 2;
2808}
2809
2811 bool IsGFX950) {
2812 // xdl def cycles | gfx942 | gfx950
2813 // 2 pass | 5 5
2814 // 4 pass | 7 8
2815 // 8 pass | 11 12
2816 // 16 pass | 19 20
2817 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2818}
2819
2821 bool IsGFX950) {
2822 // xdl def cycles | gfx942 | gfx950
2823 // 2 pass | 5 5
2824 // 4 pass | 7 8
2825 // 8 pass | 11 12
2826 // 16 pass | 19 20
2827 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2828}
2829
2831 // 2 pass -> 4
2832 // 4 pass -> 6
2833 // 8 pass -> 10
2834 // 16 pass -> 18
2835 return NumPasses + 2;
2836}
2837
2838int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2839 if (!ST.hasGFX90AInsts())
2840 return 0;
2841
2842 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2843 return SIInstrInfo::isDGEMM(MI.getOpcode());
2844 };
2845
2846 // This is checked in checkMAIHazards90A()
2847 if (SIInstrInfo::isMFMA(*MI))
2848 return 0;
2849
2850 const MachineRegisterInfo &MRI = MF.getRegInfo();
2851
2852 int WaitStatesNeeded = 0;
2853
2854 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
2855 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2856 bool IsVALU = SIInstrInfo::isVALU(*MI);
2857
2858 const MachineInstr *MFMA = nullptr;
2859 unsigned Reg;
2860 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2861 if (!SIInstrInfo::isMFMA(MI) ||
2862 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2863 return false;
2864 MFMA = &MI;
2865 return true;
2866 };
2867
2868 const MachineInstr *DOT = nullptr;
2869 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2870 if (!SIInstrInfo::isDOT(MI) ||
2871 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2872 return false;
2873 DOT = &MI;
2874 return true;
2875 };
2876
2877 bool DGEMMAfterVALUWrite = false;
2878 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2879 // Found DGEMM on reverse traversal to def.
2880 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
2881 DGEMMAfterVALUWrite = true;
2882
2883 // Only a hazard if the register is defined by a VALU and a DGEMM is found
2884 // after the def.
2885 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2886 return false;
2887
2888 return true;
2889 };
2890
2891 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2892 AMDGPU::OpName::src2);
2893
2894 if (IsMemOrExport || IsVALU) {
2895 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2896 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2897 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2898 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2899 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2900 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2901 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2902 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2903 const int DotWriteSameDotReadSrcAB = 3;
2904 const int DotWriteDifferentVALURead = 3;
2905 const int DMFMABetweenVALUWriteVMEMRead = 2;
2906 const int MaxWaitStates = 19;
2907
2908 for (const MachineOperand &Use : MI->explicit_uses()) {
2909 if (!Use.isReg())
2910 continue;
2911 Reg = Use.getReg();
2912
2913 DOT = nullptr;
2914 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2915 MaxWaitStates);
2916 if (DOT) {
2917 int NeedWaitStates = 0;
2918 if (DOT->getOpcode() == MI->getOpcode()) {
2919 if (&Use - &MI->getOperand(0) != SrcCIdx)
2920 NeedWaitStates = DotWriteSameDotReadSrcAB;
2921 } else {
2922 NeedWaitStates = DotWriteDifferentVALURead;
2923 }
2924
2925 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2926 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2927 }
2928
2929 // Workaround for a HW data hazard bug observed only in GFX90A. When there
2930 // is a DGEMM instruction in-between a VALU and a VMEM instruction, it
2931 // causes the SQ to incorrectly omit the two wait states between the two
2932 // instructions that are needed to avoid the data hazard.
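// Illustrative only (hypothetical sequence, not from this file):
//   v_mov_b32 v0, ...            ; VALU writes v0
//   v_mfma_f64_16x16x4f64 ...    ; DGEMM in between
//   ds_write_b32 v1, v0          ; LDS/VMEM reads v0
// Without the explicit wait states computed below, the hardware would not
// insert the required two on its own.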
2933 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2934 DGEMMAfterVALUWrite = false;
2935 if (TRI.isVectorRegister(MRI, Reg)) {
2936 int WaitStatesNeededForUse =
2937 DMFMABetweenVALUWriteVMEMRead -
2938 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2939 DMFMABetweenVALUWriteVMEMRead);
2940
2941 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2942 }
2943 }
2944
2945 MFMA = nullptr;
2946 WaitStatesSinceDef =
2947 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2948 if (!MFMA)
2949 continue;
2950
2951 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2952 int NumPasses = HazardDefLatency;
2953 int NeedWaitStates = MaxWaitStates;
2954
2955 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
2956 switch (HazardDefLatency) {
2957 case 4:
2958 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2959 : DMFMA4x4WriteVgprVALUReadWaitStates;
2960 break;
2961 case 8:
2962 case 16:
2963 NeedWaitStates =
2964 IsMemOrExport
2965 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2966 : (ST.hasGFX950Insts()
2967 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
2968 : DMFMA16x16WriteVgprVALUReadWaitStates);
2969 break;
2970 default:
2971 llvm_unreachable("unexpected dgemm");
2972 }
2973 } else if (ST.hasGFX940Insts()) {
2974 NeedWaitStates =
2975 TII.isXDL(*MFMA)
2977 NumPasses, ST.hasGFX950Insts())
2979 NumPasses);
2980 } else {
2981 switch (HazardDefLatency) {
2982 case 2:
2983 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2984 break;
2985 case 8:
2986 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2987 break;
2988 case 16:
2989 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2990 break;
2991 default:
2992 llvm_unreachable("unexpected number of passes for mfma");
2993 }
2994 }
2995
2996 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2997 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2998
2999 if (WaitStatesNeeded == MaxWaitStates)
3000 break;
3001 }
3002 }
3003
3004 unsigned Opc = MI->getOpcode();
3005 const int DMFMAToFMA64WaitStates = 2;
3006 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3007 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3008 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3009 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3010 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3011 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3012 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3013 }
3014
3015 if (!IsVALU && !IsMemOrExport)
3016 return WaitStatesNeeded;
3017
3018 for (const MachineOperand &Def : MI->defs()) {
3019 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3020 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3021 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3022 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3023 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3024 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3025 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3026 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3027 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3028 const int DotWriteDifferentVALUWrite = 3;
3029 const int MaxWaitStates = 19;
3030 const int MaxWarWaitStates = 15;
3031
3032 Reg = Def.getReg();
3033
3034 DOT = nullptr;
3035 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3036 MaxWaitStates);
3037 if (DOT && DOT->getOpcode() != MI->getOpcode())
3038 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3039 WaitStatesSinceDef);
3040
3041 MFMA = nullptr;
3042 WaitStatesSinceDef =
3043 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3044 if (MFMA) {
3045 int NeedWaitStates = MaxWaitStates;
3046 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3047
3048 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3049 switch (NumPasses) {
3050 case 4:
3051 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3052 break;
3053 case 8:
3054 case 16:
3055 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3056 break;
3057 default:
3058 llvm_unreachable("unexpected number of cycles for dgemm");
3059 }
3060 } else if (ST.hasGFX940Insts()) {
3061 NeedWaitStates =
3062 TII.isXDL(*MFMA)
3064 NumPasses, ST.hasGFX950Insts())
3066 } else {
3067 switch (NumPasses) {
3068 case 2:
3069 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3070 break;
3071 case 8:
3072 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3073 break;
3074 case 16:
3075 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3076 break;
3077 default:
3078 llvm_unreachable("Unexpected number of passes for mfma");
3079 }
3080 }
3081
3082 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3083 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3084
3085 if (WaitStatesNeeded == MaxWaitStates)
3086 break;
3087 }
3088
3089 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3090 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3091 !MI.readsRegister(Reg, &TRI))
3092 return false;
3093
3094 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3095 return false;
3096
3097 const MachineOperand *SrcC =
3098 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3099 assert(SrcC);
3100 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3101 return false;
3102
3103 MFMA = &MI;
3104 return true;
3105 };
3106
3107 MFMA = nullptr;
3108 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3109 MaxWarWaitStates);
3110 if (!MFMA)
3111 continue;
3112
3113 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3114 int NeedWaitStates = MaxWaitStates;
3115 switch (HazardDefLatency) {
3116 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3117 break;
3118 case 4: assert(ST.hasGFX940Insts());
3119 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3120 break;
3121 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3122 break;
3123 case 16: [[fallthrough]];
3124 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3125 break;
3126 }
3127
3128 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3129 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3130 }
3131
3132 return WaitStatesNeeded;
3133}
3134
3135bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
3136 if (!SU->isInstr())
3137 return false;
3138
3139 const MachineInstr *MAI = nullptr;
3140
3141 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3142 MAI = nullptr;
3143 if (SIInstrInfo::isMFMA(MI))
3144 MAI = &MI;
3145 return MAI != nullptr;
3146 };
3147
3148 MachineInstr *MI = SU->getInstr();
3149 if (IsMFMAFn(*MI)) {
3150 int W = getWaitStatesSince(IsMFMAFn, 16);
3151 if (MAI)
3152 return W < (int)TSchedModel.computeInstrLatency(MAI);
3153 }
3154
3155 return false;
3156}
3157
3158// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3159// insertion of a new instruction.
3160static void updateGetPCBundle(MachineInstr *NewMI) {
3161 if (!NewMI->isBundled())
3162 return;
3163
3164 // Find start of bundle.
3165 auto I = NewMI->getIterator();
3166 while (I->isBundledWithPred())
3167 I--;
3168 if (I->isBundle())
3169 I++;
3170
3171 // Bail if this is not an S_GETPC bundle.
3172 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3173 return;
3174
3175 // Update offsets of any references in the bundle.
3176 const unsigned NewBytes = 4;
3177 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3178 "Unexpected instruction insertion in bundle");
3179 auto NextMI = std::next(NewMI->getIterator());
3180 auto End = NewMI->getParent()->end();
3181 while (NextMI != End && NextMI->isBundledWithPred()) {
3182 for (auto &Operand : NextMI->operands()) {
3183 if (Operand.isGlobal())
3184 Operand.setOffset(Operand.getOffset() + NewBytes);
3185 }
3186 NextMI++;
3187 }
3188}
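// Illustrative only (hypothetical bundle): s_getpc_b64 is typically bundled
// with address math, e.g.
//   s_getpc_b64 s[0:1]
//   s_add_u32  s0, s0, sym@rel32@lo+4
//   s_addc_u32 s1, s1, sym@rel32@hi+12
// Inserting a 4-byte s_waitcnt_depctr into the bundle shifts the PC-relative
// distances, so each global-symbol offset after the insertion is bumped by 4.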
3189
3190bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3191 if (!ST.hasVALUMaskWriteHazard())
3192 return false;
3194
3195 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
3196 return false;
3197
3198 // The hazard sequence is three instructions:
3199 // 1. VALU reads SGPR as mask
3200 // 2. SALU writes SGPR
3201 // 3. SALU reads SGPR
3202 // The hazard can expire if the distance between 2 and 3 is sufficient.
3203 // In practice this happens <10% of the time, hence we always assume the
3204 // hazard exists when 1 and 2 are present, to avoid the search.
3205
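// Illustrative only (hypothetical wave64 sequence, not from this file):
//   v_cndmask_b32 v0, v1, v2, s[4:5]   ; 1. VALU reads s[4:5] as mask
//   s_mov_b64 s[4:5], exec             ; 2. SALU writes s[4:5]
//   s_and_b64 s[6:7], s[4:5], s[8:9]   ; 3. SALU reads s[4:5]
// The fix below appends "s_waitcnt_depctr sa_sdst(0)" right after instruction 2.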
3206 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3207 if (!SDSTOp || !SDSTOp->isReg())
3208 return false;
3209
3210 const Register HazardReg = SDSTOp->getReg();
3211 if (HazardReg == AMDGPU::EXEC ||
3212 HazardReg == AMDGPU::EXEC_LO ||
3213 HazardReg == AMDGPU::EXEC_HI ||
3214 HazardReg == AMDGPU::M0)
3215 return false;
3216
3217 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
3218 switch (I.getOpcode()) {
3219 case AMDGPU::V_ADDC_U32_e32:
3220 case AMDGPU::V_ADDC_U32_dpp:
3221 case AMDGPU::V_CNDMASK_B16_t16_e32:
3222 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3223 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3224 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3225 case AMDGPU::V_CNDMASK_B32_e32:
3226 case AMDGPU::V_CNDMASK_B32_dpp:
3227 case AMDGPU::V_DIV_FMAS_F32_e64:
3228 case AMDGPU::V_DIV_FMAS_F64_e64:
3229 case AMDGPU::V_SUBB_U32_e32:
3230 case AMDGPU::V_SUBB_U32_dpp:
3231 case AMDGPU::V_SUBBREV_U32_e32:
3232 case AMDGPU::V_SUBBREV_U32_dpp:
3233 // These implicitly read VCC as mask source.
3234 return HazardReg == AMDGPU::VCC ||
3235 HazardReg == AMDGPU::VCC_LO ||
3236 HazardReg == AMDGPU::VCC_HI;
3237 case AMDGPU::V_ADDC_U32_e64:
3238 case AMDGPU::V_ADDC_U32_e64_dpp:
3239 case AMDGPU::V_CNDMASK_B16_t16_e64:
3240 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3241 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3242 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3243 case AMDGPU::V_CNDMASK_B32_e64:
3244 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3245 case AMDGPU::V_SUBB_U32_e64:
3246 case AMDGPU::V_SUBB_U32_e64_dpp:
3247 case AMDGPU::V_SUBBREV_U32_e64:
3248 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3249 // Only check mask register overlaps.
3250 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3251 assert(SSRCOp);
3252 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
3253 }
3254 default:
3255 return false;
3256 }
3257 };
3258
3259 const MachineRegisterInfo &MRI = MF.getRegInfo();
3260 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
3261 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
3262 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3263 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3264 return true;
3265
3266 // VALU access to any SGPR or literal constant other than HazardReg
3267 // mitigates hazard. No need to check HazardReg here as this will
3268 // only be called when !IsHazardFn.
3269 if (!SIInstrInfo::isVALU(I))
3270 return false;
3271 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
3272 const MachineOperand &Op = I.getOperand(OpNo);
3273 if (Op.isReg()) {
3274 Register OpReg = Op.getReg();
3275 // Only consider uses
3276 if (!Op.isUse())
3277 continue;
3278 // Ignore EXEC
3279 if (OpReg == AMDGPU::EXEC ||
3280 OpReg == AMDGPU::EXEC_LO ||
3281 OpReg == AMDGPU::EXEC_HI)
3282 continue;
3283 // Ignore all implicit uses except VCC
3284 if (Op.isImplicit()) {
3285 if (OpReg == AMDGPU::VCC ||
3286 OpReg == AMDGPU::VCC_LO ||
3287 OpReg == AMDGPU::VCC_HI)
3288 return true;
3289 continue;
3290 }
3291 if (TRI.isSGPRReg(MRI, OpReg))
3292 return true;
3293 } else {
3294 const MCInstrDesc &InstDesc = I.getDesc();
3295 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3296 if (!TII.isInlineConstant(Op, OpInfo))
3297 return true;
3298 }
3299 }
3300 return false;
3301 };
3302
3303 // Check for hazard
3304 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
3305 std::numeric_limits<int>::max())
3306 return false;
3307
3308 auto NextMI = std::next(MI->getIterator());
3309
3310 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3311 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3312 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3313 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3314
3315 // SALU write may be s_getpc in a bundle.
3316 updateGetPCBundle(NewMI);
3317
3318 return true;
3319}
3320
3321static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3322 const SIInstrInfo &TII) {
3323 MachineBasicBlock &EntryMBB = MF->front();
3324 if (EntryMBB.begin() != EntryMBB.end()) {
3325 auto &EntryMI = *EntryMBB.begin();
3326 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3327 EntryMI.getOperand(0).getImm() >= Priority)
3328 return false;
3329 }
3330
3331 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3332 .addImm(Priority);
3333 return true;
3334}
3335
3336bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3337 if (!ST.hasRequiredExportPriority())
3338 return false;
3339
3340 // Assume the following shader types will never have exports,
3341 // and avoid adding or adjusting S_SETPRIO.
3342 MachineBasicBlock *MBB = MI->getParent();
3343 MachineFunction *MF = MBB->getParent();
3344 auto CC = MF->getFunction().getCallingConv();
3345 switch (CC) {
3350 return false;
3351 default:
3352 break;
3353 }
3354
3355 const int MaxPriority = 3;
3356 const int NormalPriority = 2;
3357 const int PostExportPriority = 0;
3358
3359 auto It = MI->getIterator();
3360 switch (MI->getOpcode()) {
3361 case AMDGPU::S_ENDPGM:
3362 case AMDGPU::S_ENDPGM_SAVED:
3363 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3364 case AMDGPU::SI_RETURN_TO_EPILOG:
3365 // Ensure shader with calls raises priority at entry.
3366 // This ensures correct priority if exports exist in callee.
3367 if (MF->getFrameInfo().hasCalls())
3368 return ensureEntrySetPrio(MF, NormalPriority, TII);
3369 return false;
3370 case AMDGPU::S_SETPRIO: {
3371 // Raise minimum priority unless in workaround.
3372 auto &PrioOp = MI->getOperand(0);
3373 int Prio = PrioOp.getImm();
3374 bool InWA = (Prio == PostExportPriority) &&
3375 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3376 if (InWA || Prio >= NormalPriority)
3377 return false;
3378 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3379 return true;
3380 }
3381 default:
3382 if (!TII.isEXP(*MI))
3383 return false;
3384 break;
3385 }
3386
3387 // Check entry priority at each export (as there will only be a few).
3388 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3389 bool Changed = false;
3390 if (CC != CallingConv::AMDGPU_Gfx)
3391 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3392
3393 auto NextMI = std::next(It);
3394 bool EndOfShader = false;
3395 if (NextMI != MBB->end()) {
3396 // Only need WA at end of sequence of exports.
3397 if (TII.isEXP(*NextMI))
3398 return Changed;
3399 // Assume appropriate S_SETPRIO after export means WA already applied.
3400 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3401 NextMI->getOperand(0).getImm() == PostExportPriority)
3402 return Changed;
3403 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3404 }
3405
3406 const DebugLoc &DL = MI->getDebugLoc();
3407
3408 // Lower priority.
3409 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3410 .addImm(PostExportPriority);
3411
3412 if (!EndOfShader) {
3413 // Wait for exports to complete.
3414 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3415 .addReg(AMDGPU::SGPR_NULL)
3416 .addImm(0);
3417 }
3418
3419 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3420 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3421
3422 if (!EndOfShader) {
3423 // Return to normal (higher) priority.
3424 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3425 .addImm(NormalPriority);
3426 }
3427
3428 return true;
3429}
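// Illustrative only: after the last export of a sequence that is not at the end
// of the shader, the code above emits
//   s_setprio 0
//   s_waitcnt_expcnt null, 0x0
//   s_nop 0
//   s_nop 0
//   s_setprio 2
// i.e. lower priority, wait for exports to complete, then restore normal
// priority; at the end of the shader only the s_setprio 0 and s_nops remain.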
3430
3431bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3432 if (!isSGetReg(MI->getOpcode()))
3433 return false;
3434
3435 const SIInstrInfo *TII = ST.getInstrInfo();
3436 switch (getHWReg(TII, *MI)) {
3437 default:
3438 return false;
3443 break;
3444 }
3445
3446 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3447 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3448 .addImm(0);
3449 return true;
3450}
3451
3452bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3453 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3454 return false;
3455
3456 const SIInstrInfo *TII = ST.getInstrInfo();
3457 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3458 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3459 .addImm(0xFFE3);
3460 BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
3461 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3462 .addImm(0xFFE3);
3463
3464 return true;
3465}
3466
3467bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3468 // No reason to check this in pre-RA scheduling; SGPRs have to be allocated
3469 // for the hazard to trigger.
3470 if (!IsHazardRecognizerMode)
3471 return false;
3472
3473 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3474 const SIInstrInfo *TII = ST.getInstrInfo();
3475 // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3476 const int FlatScrBaseWaitStates = 10;
3477
3478 bool ReadsFlatScrLo =
3479 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3480 bool ReadsFlatScrHi =
3481 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3482 if (isSGetReg(MI->getOpcode())) {
3483 switch (getHWReg(TII, *MI)) {
3484 default:
3485 break;
3487 ReadsFlatScrLo = true;
3488 break;
3490 ReadsFlatScrHi = true;
3491 break;
3492 }
3493 }
3494
3495 const MachineRegisterInfo &MRI = MF.getRegInfo();
3496
3497 auto IsRegDefHazard = [&](Register Reg) -> bool {
3499 auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3500 return MI.modifiesRegister(Reg, TRI);
3501 };
3502
3503 // This repurposes the wait-state count: instead of wait states it counts
3504 // SGPR-writing instructions, returning 1 for an SGPR write and 0 otherwise.
3505 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3506 if (!TII->isSALU(MI) && !TII->isVALU(MI))
3507 return 0;
3508 for (const MachineOperand &MO : MI.all_defs()) {
3509 if (TRI->isSGPRReg(MRI, MO.getReg()))
3510 return 1;
3511 }
3512 return 0;
3513 };
3514
3515 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3516 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3517 unsigned Wait = MI.getOperand(0).getImm();
3520 return true;
3521 }
3522 return SgprWrites >= FlatScrBaseWaitStates;
3523 };
3524
3525 return ::getWaitStatesSince(
3526 IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
3527 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3528 };
3529
3530 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
3531 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3532 (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
3533 !IsRegDefHazard(AMDGPU::SGPR103)))
3534 return false;
3535
3536 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3537 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3540 return true;
3541}
3542
3543bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3544 if (!isSSetReg(MI->getOpcode()) ||
3545 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3546 return false;
3547
3548 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3549 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3550 return true;
3551}
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
GCNHazardRecognizer(const MachineFunction &MF)
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
bool hasShift64HighRegBug() const
bool hasFPAtomicToDenormModeHazard() const
bool hasLdsBranchVmemWARHazard() const
bool hasGFX90AInsts() const
bool hasDstSelForwardingHazard() const
bool hasMAIInsts() const
Definition: GCNSubtarget.h:878
bool hasCvtScaleForwardingHazard() const
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:308
bool hasVALUMaskWriteHazard() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const
bool hasVcmpxExecWARHazard() const
bool hasReadM0MovRelInterpHazard() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:320
bool hasRequiredExportPriority() const
bool hasLdsWaitVMSRC() const
bool hasExtendedWaitCounts() const
bool hasVcmpxPermlaneHazard() const
bool hasGFX950Insts() const
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
bool hasNoDataDepHazard() const
Definition: GCNSubtarget.h:999
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
Definition: GCNSubtarget.h:554
bool hasTransForwardingHazard() const
bool hasGFX940Insts() const
bool hasReadM0LdsDmaHazard() const
bool hasScratchBaseForwardingHazard() const
bool hasSMEMtoVectorWriteHazard() const
bool hasVMEMtoScalarWriteHazard() const
bool hasNSAtoVMEMBug() const
bool hasVDecCoExecHazard() const
bool hasReadM0SendMsgHazard() const
bool requiresWaitIdleBeforeGetReg() const
bool hasReadM0LdsDirectHazard() const
bool isXNACKEnabled() const
Definition: GCNSubtarget.h:656
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
Definition: GCNSubtarget.h:539
bool hasRFEHazards() const
Definition: GCNSubtarget.h:549
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
Definition: GCNSubtarget.h:545
bool isWave64() const
bool setRegModeNeedsVNOPs() const
bool hasVALUTransUseHazard() const
bool hasLdsDirect() const
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:199
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:240
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:86
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Instructions::const_reverse_iterator const_reverse_instr_iterator
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< pred_iterator > predecessors()
bool hasCalls() const
Return true if the current function has any function calls.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:72
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:587
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:359
bool isBundle() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction part of a bundle.
Definition: MachineInstr.h:484
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:844
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:586
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:576
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:568
static bool isDGEMM(unsigned Opcode)
Definition: SIInstrInfo.h:889
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:701
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:440
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isVINTRP(const MachineInstr &MI)
Definition: SIInstrInfo.h:836
static bool isDOT(const MachineInstr &MI)
Definition: SIInstrInfo.h:857
static bool isSWMMAC(const MachineInstr &MI)
Definition: SIInstrInfo.h:873
static bool isLDSDIR(const MachineInstr &MI)
Definition: SIInstrInfo.h:891
bool isBufferSMRD(const MachineInstr &MI) const
static bool isTRANS(const MachineInstr &MI)
Definition: SIInstrInfo.h:820
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:560
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:650
static bool isWaitcnt(unsigned Opcode)
Definition: SIInstrInfo.h:1053
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:812
static bool isMFMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:852
static bool isFPAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:974
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:861
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:644
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:249
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:387
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:399
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:104
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:168
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:356
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:480
ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const
LLVM_ABI const MCSchedClassDesc * resolveSchedClass(const MachineInstr *MI) const
Return the MCSchedClassDesc for this instruction.
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:35
LLVM Value Representation.
Definition: Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:134
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
@ Entry
Definition: COFF.h:862
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
constexpr double e
Definition: MathExtras.h:47
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Offset
Definition: DWP.cpp:477
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Wait
Definition: Threading.h:60
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
Instruction set architecture version.
Definition: TargetParser.h:132
Represents the counter values to wait for in an s_waitcnt instruction.
Description of the encoding of one expression Op.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:123
uint16_t ReleaseAtCycle
Cycle at which the resource will be released by an instruction, relatively to the cycle in which the ...
Definition: MCSchedule.h:73
Definition: regcomp.c:186