41#define DEBUG_TYPE "si-insert-waitcnts"
44 "Force emit s_waitcnt expcnt(0) instrs");
46 "Force emit s_waitcnt lgkmcnt(0) instrs");
48 "Force emit s_waitcnt vmcnt(0) instrs");
52 cl::desc(
"Force all waitcnt instrs to be emitted as "
53 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
57 "amdgpu-waitcnt-load-forcezero",
58 cl::desc(
"Force all waitcnt load counters to wait until 0"),
72 SAMPLE_CNT = NUM_NORMAL_INST_CNTS,
75 NUM_EXTENDED_INST_CNTS,
76 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
90auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
91 return enum_seq(LOAD_CNT, MaxCounter);
94using RegInterval = std::pair<int, int>;
96struct HardwareLimits {
100 unsigned StorecntMax;
101 unsigned SamplecntMax;
106struct RegisterEncoding {
116 VMEM_SAMPLER_READ_ACCESS,
117 VMEM_BVH_READ_ACCESS,
119 SCRATCH_WRITE_ACCESS,
139enum RegisterMapping {
140 SQ_MAX_PGM_VGPRS = 512,
142 SQ_MAX_PGM_SGPRS = 256,
150 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS,
171static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
172 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
173 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
174 AMDGPU::S_WAIT_KMCNT};
182static bool isNormalMode(InstCounterType MaxCounter) {
183 return MaxCounter == NUM_NORMAL_INST_CNTS;
188 assert(updateVMCntOnly(Inst));
191 return VMEM_NOSAMPLER;
198 return BaseInfo->
BVH ? VMEM_BVH
212 return Wait.StoreCnt;
214 return Wait.SampleCnt;
225 unsigned &WC = getCounterRef(
Wait,
T);
226 WC = std::min(WC, Count);
230 getCounterRef(
Wait,
T) = ~0
u;
234 return getCounterRef(
Wait,
T);
238InstCounterType eventCounter(
const unsigned *masks, WaitEventType E) {
239 for (
auto T : inst_counter_types()) {
240 if (masks[
T] & (1 << E))
254class WaitcntBrackets {
256 WaitcntBrackets(
const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
257 HardwareLimits Limits, RegisterEncoding Encoding,
258 const unsigned *WaitEventMaskForInst,
259 InstCounterType SmemAccessCounter)
260 :
ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
261 Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
262 SmemAccessCounter(SmemAccessCounter) {}
264 unsigned getWaitCountMax(InstCounterType
T)
const {
267 return Limits.LoadcntMax;
269 return Limits.DscntMax;
271 return Limits.ExpcntMax;
273 return Limits.StorecntMax;
275 return Limits.SamplecntMax;
277 return Limits.BvhcntMax;
279 return Limits.KmcntMax;
286 unsigned getScoreLB(InstCounterType
T)
const {
291 unsigned getScoreUB(InstCounterType
T)
const {
296 unsigned getScoreRange(InstCounterType
T)
const {
297 return getScoreUB(
T) - getScoreLB(
T);
300 unsigned getRegScore(
int GprNo, InstCounterType
T)
const {
301 if (GprNo < NUM_ALL_VGPRS) {
302 return VgprScores[
T][GprNo];
304 assert(
T == SmemAccessCounter);
305 return SgprScores[GprNo - NUM_ALL_VGPRS];
315 bool counterOutOfOrder(InstCounterType
T)
const;
317 void simplifyWaitcnt(InstCounterType
T,
unsigned &Count)
const;
319 void determineWait(InstCounterType
T, RegInterval
Interval,
321 void determineWait(InstCounterType
T,
int RegNo,
323 determineWait(
T, {RegNo, RegNo + 1},
Wait);
327 void applyWaitcnt(InstCounterType
T,
unsigned Count);
332 unsigned hasPendingEvent()
const {
return PendingEvents; }
333 unsigned hasPendingEvent(WaitEventType E)
const {
334 return PendingEvents & (1 << E);
336 unsigned hasPendingEvent(InstCounterType
T)
const {
337 unsigned HasPending = PendingEvents & WaitEventMaskForInst[
T];
338 assert((HasPending != 0) == (getScoreRange(
T) != 0));
342 bool hasMixedPendingEvents(InstCounterType
T)
const {
343 unsigned Events = hasPendingEvent(
T);
345 return Events & (Events - 1);
348 bool hasPendingFlat()
const {
349 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
350 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
351 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
352 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
355 void setPendingFlat() {
356 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
357 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
362 bool hasOtherPendingVmemTypes(RegInterval
Interval, VmemType V)
const {
364 assert(RegNo < NUM_ALL_VGPRS);
365 if (VgprVmemTypes[RegNo] & ~(1 << V))
371 void clearVgprVmemTypes(RegInterval
Interval) {
373 assert(RegNo < NUM_ALL_VGPRS);
374 VgprVmemTypes[RegNo] = 0;
378 void setStateOnFunctionEntryOrReturn() {
379 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
380 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
397 static bool mergeScore(
const MergeInfo &M,
unsigned &Score,
398 unsigned OtherScore);
400 void setScoreLB(InstCounterType
T,
unsigned Val) {
405 void setScoreUB(InstCounterType
T,
unsigned Val) {
412 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
416 void setRegScore(
int GprNo, InstCounterType
T,
unsigned Val) {
417 setScoreByInterval({GprNo, GprNo + 1},
T, Val);
420 void setScoreByInterval(RegInterval
Interval, InstCounterType CntTy,
429 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
430 HardwareLimits Limits = {};
431 RegisterEncoding Encoding = {};
432 const unsigned *WaitEventMaskForInst;
433 InstCounterType SmemAccessCounter;
434 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
435 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
436 unsigned PendingEvents = 0;
438 unsigned LastFlat[NUM_INST_CNTS] = {0};
443 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
446 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
449 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
461class WaitcntGenerator {
466 InstCounterType MaxCounter;
470 WaitcntGenerator() =
default;
471 WaitcntGenerator(
const MachineFunction &MF, InstCounterType MaxCounter)
479 bool isOptNone()
const {
return OptNone; }
493 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
508 virtual const unsigned *getWaitEventMask()
const = 0;
512 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const = 0;
514 virtual ~WaitcntGenerator() =
default;
517 static constexpr unsigned
518 eventMask(std::initializer_list<WaitEventType> Events) {
520 for (
auto &E : Events)
527class WaitcntGeneratorPreGFX12 :
public WaitcntGenerator {
529 WaitcntGeneratorPreGFX12() =
default;
531 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
534 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
542 const unsigned *getWaitEventMask()
const override {
545 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
546 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
547 VMEM_BVH_READ_ACCESS}),
548 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
549 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
550 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
551 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
556 return WaitEventMaskForInstPreGFX12;
562class WaitcntGeneratorGFX12Plus :
public WaitcntGenerator {
564 WaitcntGeneratorGFX12Plus() =
default;
566 InstCounterType MaxCounter)
567 : WaitcntGenerator(MF, MaxCounter) {}
570 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
578 const unsigned *getWaitEventMask()
const override {
581 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
582 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
583 eventMask({LDS_ACCESS, GDS_ACCESS}),
584 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
585 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
586 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
587 eventMask({VMEM_SAMPLER_READ_ACCESS}),
588 eventMask({VMEM_BVH_READ_ACCESS}),
589 eventMask({SMEM_ACCESS, SQ_MESSAGE})};
591 return WaitEventMaskForInstGFX12Plus;
611 std::unique_ptr<WaitcntBrackets>
Incoming;
615 InstCounterType SmemAccessCounter;
619 bool ForceEmitWaitcnt[NUM_INST_CNTS];
624 WaitcntGeneratorPreGFX12 WCGPreGFX12;
625 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
627 WaitcntGenerator *WCG =
nullptr;
633 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
639 (void)ForceExpCounter;
640 (void)ForceLgkmCounter;
641 (void)ForceVMCounter;
644 bool shouldFlushVmCnt(
MachineLoop *
ML, WaitcntBrackets &Brackets);
646 WaitcntBrackets &ScoreBrackets);
651 return "SI insert wait instructions";
663 bool isForceEmitWaitcnt()
const {
664 for (
auto T : inst_counter_types())
665 if (ForceEmitWaitcnt[
T])
670 void setForceEmitWaitcnt() {
676 ForceEmitWaitcnt[
EXP_CNT] =
true;
678 ForceEmitWaitcnt[
EXP_CNT] =
false;
683 ForceEmitWaitcnt[DS_CNT] =
true;
684 ForceEmitWaitcnt[KM_CNT] =
true;
686 ForceEmitWaitcnt[DS_CNT] =
false;
687 ForceEmitWaitcnt[KM_CNT] =
false;
692 ForceEmitWaitcnt[LOAD_CNT] =
true;
693 ForceEmitWaitcnt[SAMPLE_CNT] =
true;
694 ForceEmitWaitcnt[BVH_CNT] =
true;
696 ForceEmitWaitcnt[LOAD_CNT] =
false;
697 ForceEmitWaitcnt[SAMPLE_CNT] =
false;
698 ForceEmitWaitcnt[BVH_CNT] =
false;
705 WaitEventType getVmemWaitEventType(
const MachineInstr &Inst)
const {
707 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
708 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
720 return SCRATCH_WRITE_ACCESS;
721 return VMEM_WRITE_ACCESS;
724 return VMEM_READ_ACCESS;
725 return VmemReadMapping[getVmemType(Inst)];
732 WaitcntBrackets &ScoreBrackets,
740 WaitcntBrackets *ScoreBrackets);
742 WaitcntBrackets &ScoreBrackets);
747RegInterval WaitcntBrackets::getRegInterval(
const MachineInstr *
MI,
751 if (!
TRI->isInAllocatableClass(
Op.getReg()))
763 if (
TRI->isVectorRegister(*
MRI,
Op.getReg())) {
764 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
767 Result.first += AGPR_OFFSET;
769 }
else if (
TRI->isSGPRReg(*
MRI,
Op.getReg())) {
770 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
771 Result.first =
Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
773 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
781 unsigned Size =
TRI->getRegSizeInBits(*RC);
787void WaitcntBrackets::setScoreByInterval(RegInterval
Interval,
788 InstCounterType CntTy,
791 if (RegNo < NUM_ALL_VGPRS) {
792 VgprUB = std::max(VgprUB, RegNo);
793 VgprScores[CntTy][RegNo] = Score;
795 assert(CntTy == SmemAccessCounter);
796 SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
797 SgprScores[RegNo - NUM_ALL_VGPRS] = Score;
806 InstCounterType CntTy,
unsigned Score) {
808 setScoreByInterval(
Interval, CntTy, Score);
815 InstCounterType
T = eventCounter(WaitEventMaskForInst, E);
817 unsigned UB = getScoreUB(
T);
818 unsigned CurrScore = UB + 1;
824 PendingEvents |= 1 << E;
825 setScoreUB(
T, CurrScore);
833 if (
const auto *AddrOp =
TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
834 setScoreByOperand(&Inst,
TRI,
MRI, *AddrOp, EXP_CNT, CurrScore);
837 if (
const auto *Data0 =
838 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
839 setScoreByOperand(&Inst,
TRI,
MRI, *Data0, EXP_CNT, CurrScore);
840 if (
const auto *Data1 =
841 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
842 setScoreByOperand(&Inst,
TRI,
MRI, *Data1, EXP_CNT, CurrScore);
845 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
846 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
848 if (
TRI->isVectorRegister(*
MRI,
Op.getReg()))
849 setScoreByOperand(&Inst,
TRI,
MRI,
Op, EXP_CNT, CurrScore);
852 }
else if (
TII->isFLAT(Inst)) {
854 setScoreByOperand(&Inst,
TRI,
MRI,
855 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
858 setScoreByOperand(&Inst,
TRI,
MRI,
859 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
862 }
else if (
TII->isMIMG(Inst)) {
867 setScoreByOperand(&Inst,
TRI,
MRI,
868 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
871 }
else if (
TII->isMTBUF(Inst)) {
875 }
else if (
TII->isMUBUF(Inst)) {
880 setScoreByOperand(&Inst,
TRI,
MRI,
881 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
884 }
else if (
TII->isLDSDIR(Inst)) {
886 setScoreByOperand(&Inst,
TRI,
MRI,
887 *
TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
890 if (
TII->isEXP(Inst)) {
896 if (
TRI->isVGPR(*
MRI, DefMO.getReg())) {
897 setScoreByOperand(&Inst,
TRI,
MRI, DefMO, EXP_CNT, CurrScore);
902 if (
TRI->isVectorRegister(*
MRI,
Op.getReg()))
903 setScoreByOperand(&Inst,
TRI,
MRI,
Op, EXP_CNT, CurrScore);
918 if (
T == LOAD_CNT ||
T == SAMPLE_CNT ||
T == BVH_CNT) {
919 if (
Interval.first >= NUM_ALL_VGPRS)
921 if (updateVMCntOnly(Inst)) {
926 VmemType
V = getVmemType(Inst);
928 VgprVmemTypes[RegNo] |= 1 <<
V;
931 setScoreByInterval(
Interval,
T, CurrScore);
934 (
TII->isDS(Inst) ||
TII->mayWriteLDSThroughDMA(Inst))) {
939 if (!
MemOp->isStore() ||
944 auto AAI =
MemOp->getAAInfo();
952 if (!AAI || !AAI.Scope)
954 for (
unsigned I = 0, E = LDSDMAStores.size();
I != E && !Slot; ++
I) {
955 for (
const auto *
MemOp : LDSDMAStores[
I]->memoperands()) {
956 if (
MemOp->isStore() && AAI ==
MemOp->getAAInfo()) {
962 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
964 LDSDMAStores.push_back(&Inst);
965 Slot = LDSDMAStores.size();
968 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot,
T, CurrScore);
970 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS,
T, CurrScore);
977 for (
auto T : inst_counter_types(MaxCounter)) {
978 unsigned SR = getScoreRange(
T);
982 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
986 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
990 OS <<
" EXP_CNT(" << SR <<
"): ";
993 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
997 OS <<
" SAMPLE_CNT(" << SR <<
"): ";
1000 OS <<
" BVH_CNT(" << SR <<
"): ";
1003 OS <<
" KM_CNT(" << SR <<
"): ";
1006 OS <<
" UNKNOWN(" << SR <<
"): ";
1012 unsigned LB = getScoreLB(
T);
1014 for (
int J = 0; J <= VgprUB; J++) {
1015 unsigned RegScore = getRegScore(J,
T);
1018 unsigned RelScore = RegScore - LB - 1;
1019 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
1020 OS << RelScore <<
":v" << J <<
" ";
1022 OS << RelScore <<
":ds ";
1026 if (
T == SmemAccessCounter) {
1027 for (
int J = 0; J <= SgprUB; J++) {
1028 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS,
T);
1031 unsigned RelScore = RegScore - LB - 1;
1032 OS << RelScore <<
":s" << J <<
" ";
1044 simplifyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1045 simplifyWaitcnt(EXP_CNT,
Wait.ExpCnt);
1046 simplifyWaitcnt(DS_CNT,
Wait.DsCnt);
1047 simplifyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1048 simplifyWaitcnt(SAMPLE_CNT,
Wait.SampleCnt);
1049 simplifyWaitcnt(BVH_CNT,
Wait.BvhCnt);
1050 simplifyWaitcnt(KM_CNT,
Wait.KmCnt);
1053void WaitcntBrackets::simplifyWaitcnt(InstCounterType
T,
1054 unsigned &Count)
const {
1058 if (Count >= getScoreRange(
T))
1062void WaitcntBrackets::determineWait(InstCounterType
T, RegInterval
Interval,
1064 const unsigned LB = getScoreLB(
T);
1065 const unsigned UB = getScoreUB(
T);
1067 unsigned ScoreToWait = getRegScore(RegNo,
T);
1071 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1072 if ((
T == LOAD_CNT ||
T == DS_CNT) && hasPendingFlat() &&
1073 !
ST->hasFlatLgkmVMemCountInOrder()) {
1077 addWait(
Wait,
T, 0);
1078 }
else if (counterOutOfOrder(
T)) {
1082 addWait(
Wait,
T, 0);
1086 unsigned NeededWait =
1087 std::min(UB - ScoreToWait, getWaitCountMax(
T) - 1);
1088 addWait(
Wait,
T, NeededWait);
1095 applyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1096 applyWaitcnt(EXP_CNT,
Wait.ExpCnt);
1097 applyWaitcnt(DS_CNT,
Wait.DsCnt);
1098 applyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1099 applyWaitcnt(SAMPLE_CNT,
Wait.SampleCnt);
1100 applyWaitcnt(BVH_CNT,
Wait.BvhCnt);
1101 applyWaitcnt(KM_CNT,
Wait.KmCnt);
1104void WaitcntBrackets::applyWaitcnt(InstCounterType
T,
unsigned Count) {
1105 const unsigned UB = getScoreUB(
T);
1109 if (counterOutOfOrder(
T))
1111 setScoreLB(
T, std::max(getScoreLB(
T), UB - Count));
1114 PendingEvents &= ~WaitEventMaskForInst[
T];
1120bool WaitcntBrackets::counterOutOfOrder(InstCounterType
T)
const {
1122 if (
T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
1124 return hasMixedPendingEvents(
T);
1134char SIInsertWaitcnts::
ID = 0;
1139 return new SIInsertWaitcnts();
1144 int OpIdx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
OpName);
1149 if (NewEnc == MO.
getImm())
1160 case AMDGPU::S_WAIT_LOADCNT:
1162 case AMDGPU::S_WAIT_EXPCNT:
1164 case AMDGPU::S_WAIT_STORECNT:
1166 case AMDGPU::S_WAIT_SAMPLECNT:
1168 case AMDGPU::S_WAIT_BVHCNT:
1170 case AMDGPU::S_WAIT_DSCNT:
1172 case AMDGPU::S_WAIT_KMCNT:
1179bool WaitcntGenerator::promoteSoftWaitCnt(
MachineInstr *Waitcnt)
const {
1193bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1194 WaitcntBrackets &ScoreBrackets,
MachineInstr &OldWaitcntInstr,
1197 assert(isNormalMode(MaxCounter));
1205 if (
II.isMetaInstruction())
1209 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1213 if (Opcode == AMDGPU::S_WAITCNT) {
1214 unsigned IEnc =
II.getOperand(0).getImm();
1217 ScoreBrackets.simplifyWaitcnt(OldWait);
1221 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1222 II.eraseFromParent();
1227 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1228 assert(
II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1231 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1233 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1234 Wait.StoreCnt = std::min(
Wait.StoreCnt, OldVSCnt);
1236 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && TrySimplify)) {
1237 II.eraseFromParent();
1240 WaitcntVsCntInstr = &
II;
1247 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1249 ScoreBrackets.applyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1250 ScoreBrackets.applyWaitcnt(EXP_CNT,
Wait.ExpCnt);
1251 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1258 <<
"applyPreexistingWaitcnt\n"
1259 <<
"New Instr at block end: " << *WaitcntInstr <<
'\n'
1260 :
dbgs() <<
"applyPreexistingWaitcnt\n"
1261 <<
"Old Instr: " << *It
1262 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1265 if (WaitcntVsCntInstr) {
1267 AMDGPU::OpName::simm16,
Wait.StoreCnt);
1268 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1270 ScoreBrackets.applyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1271 Wait.StoreCnt = ~0
u;
1274 ?
dbgs() <<
"applyPreexistingWaitcnt\n"
1275 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1277 :
dbgs() <<
"applyPreexistingWaitcnt\n"
1278 <<
"Old Instr: " << *It
1279 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
1287bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1291 assert(isNormalMode(MaxCounter));
1298 if (
Wait.hasWaitExceptStoreCnt()) {
1300 [[maybe_unused]]
auto SWaitInst =
1305 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1306 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1309 if (
Wait.hasWaitStoreCnt()) {
1312 [[maybe_unused]]
auto SWaitInst =
1319 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1320 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1327WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1332WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1340bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1341 WaitcntBrackets &ScoreBrackets,
MachineInstr &OldWaitcntInstr,
1344 assert(!isNormalMode(MaxCounter));
1353 if (
II.isMetaInstruction())
1362 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1366 if (Opcode == AMDGPU::S_WAITCNT)
1369 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1371 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1374 ScoreBrackets.simplifyWaitcnt(OldWait);
1376 UpdatableInstr = &CombinedLoadDsCntInstr;
1377 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1379 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1382 ScoreBrackets.simplifyWaitcnt(OldWait);
1384 UpdatableInstr = &CombinedStoreDsCntInstr;
1389 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1391 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1392 addWait(
Wait, CT.value(), OldCnt);
1393 UpdatableInstr = &WaitInstrs[CT.value()];
1397 if (!*UpdatableInstr) {
1398 *UpdatableInstr = &
II;
1400 II.eraseFromParent();
1405 if (CombinedLoadDsCntInstr) {
1413 if (
Wait.LoadCnt != ~0u &&
Wait.DsCnt != ~0u) {
1416 AMDGPU::OpName::simm16, NewEnc);
1417 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1418 ScoreBrackets.applyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1419 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1424 ?
dbgs() <<
"applyPreexistingWaitcnt\n"
1425 <<
"New Instr at block end: "
1426 << *CombinedLoadDsCntInstr <<
'\n'
1427 :
dbgs() <<
"applyPreexistingWaitcnt\n"
1428 <<
"Old Instr: " << *It <<
"New Instr: "
1429 << *CombinedLoadDsCntInstr <<
'\n');
1436 if (CombinedStoreDsCntInstr) {
1438 if (
Wait.StoreCnt != ~0u &&
Wait.DsCnt != ~0u) {
1441 AMDGPU::OpName::simm16, NewEnc);
1442 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1443 ScoreBrackets.applyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1444 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1445 Wait.StoreCnt = ~0
u;
1449 ?
dbgs() <<
"applyPreexistingWaitcnt\n"
1450 <<
"New Instr at block end: "
1451 << *CombinedStoreDsCntInstr <<
'\n'
1452 :
dbgs() <<
"applyPreexistingWaitcnt\n"
1453 <<
"Old Instr: " << *It <<
"New Instr: "
1454 << *CombinedStoreDsCntInstr <<
'\n');
1467 if (
Wait.DsCnt != ~0u) {
1476 if (
Wait.LoadCnt != ~0u) {
1477 WaitsToErase.
push_back(&WaitInstrs[LOAD_CNT]);
1478 WaitsToErase.
push_back(&WaitInstrs[DS_CNT]);
1479 }
else if (
Wait.StoreCnt != ~0u) {
1480 WaitsToErase.
push_back(&WaitInstrs[STORE_CNT]);
1481 WaitsToErase.
push_back(&WaitInstrs[DS_CNT]);
1488 (*WI)->eraseFromParent();
1494 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1495 if (!WaitInstrs[CT])
1498 unsigned NewCnt = getWait(
Wait, CT);
1499 if (NewCnt != ~0u) {
1501 AMDGPU::OpName::simm16, NewCnt);
1502 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1504 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1505 setNoWait(
Wait, CT);
1508 ?
dbgs() <<
"applyPreexistingWaitcnt\n"
1509 <<
"New Instr at block end: " << *WaitInstrs[CT]
1511 :
dbgs() <<
"applyPreexistingWaitcnt\n"
1512 <<
"Old Instr: " << *It
1513 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
1524bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1528 assert(!isNormalMode(MaxCounter));
1534 if (
Wait.DsCnt != ~0u) {
1537 if (
Wait.LoadCnt != ~0u) {
1545 }
else if (
Wait.StoreCnt != ~0u) {
1552 Wait.StoreCnt = ~0
u;
1560 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1561 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1568 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1569 unsigned Count = getWait(
Wait, CT);
1573 [[maybe_unused]]
auto SWaitInst =
1580 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1581 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1588 unsigned Opc =
MI.getOpcode();
1589 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1590 !
MI.getOperand(1).isUndef();
1618bool SIInsertWaitcnts::generateWaitcntInstBefore(
MachineInstr &
MI,
1619 WaitcntBrackets &ScoreBrackets,
1622 setForceEmitWaitcnt();
1624 if (
MI.isMetaInstruction())
1633 if (
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1634 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1635 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1636 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1637 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1644 if (
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1645 MI.getOpcode() == AMDGPU::SI_RETURN ||
1646 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1648 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
false));
1656 else if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
1657 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1659 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1660 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1664 else if ((
MI.getOpcode() == AMDGPU::S_SENDMSG ||
1665 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1666 ST->hasLegacyGeometry() &&
1677 if (
MI.modifiesRegister(AMDGPU::EXEC,
TRI)) {
1680 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1681 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1682 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1683 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1694 const auto &CallAddrOp = *
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
1695 if (CallAddrOp.isReg()) {
1696 RegInterval CallAddrOpInterval =
1697 ScoreBrackets.getRegInterval(&
MI,
MRI,
TRI, CallAddrOp);
1699 ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
1702 if (
const auto *RtnAddrOp =
1703 TII->getNamedOperand(
MI, AMDGPU::OpName::dst)) {
1704 RegInterval RtnAddrOpInterval =
1705 ScoreBrackets.getRegInterval(&
MI,
MRI,
TRI, *RtnAddrOp);
1707 ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
1727 const Value *
Ptr = Memop->getValue();
1728 if (Memop->isStore() && SLoadAddresses.
count(
Ptr)) {
1729 addWait(
Wait, SmemAccessCounter, 0);
1733 unsigned AS = Memop->getAddrSpace();
1737 if (
TII->mayWriteLDSThroughDMA(
MI))
1741 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1742 bool FoundAliasingStore =
false;
1749 if (
Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1750 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1751 for (
unsigned I = 0, E = LDSDMAStores.size();
I != E; ++
I) {
1752 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true)) {
1753 FoundAliasingStore =
true;
1754 ScoreBrackets.determineWait(LOAD_CNT, RegNo +
I + 1,
Wait);
1758 if (!FoundAliasingStore)
1759 ScoreBrackets.determineWait(LOAD_CNT, RegNo,
Wait);
1760 if (Memop->isStore()) {
1761 ScoreBrackets.determineWait(EXP_CNT, RegNo,
Wait);
1771 if (
Op.isTied() &&
Op.isUse() &&
TII->doesNotReadTiedSource(
MI))
1776 const bool IsVGPR =
TRI->isVectorRegister(*
MRI,
Op.getReg());
1783 if (
Op.isImplicit() &&
MI.mayLoadOrStore())
1790 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
1791 ScoreBrackets.hasOtherPendingVmemTypes(
Interval,
1793 !
ST->hasVmemWriteVgprInOrder()) {
1795 ScoreBrackets.determineWait(SAMPLE_CNT,
Interval,
Wait);
1797 ScoreBrackets.clearVgprVmemTypes(
Interval);
1799 if (
Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1804 ScoreBrackets.determineWait(SmemAccessCounter,
Interval,
Wait);
1814 if (
TII->isBarrierStart(
MI.getOpcode()) &&
1815 !
ST->hasAutoWaitcntBeforeBarrier() && !
ST->supportsBackOffBarrier()) {
1816 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
1823 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1829 ScoreBrackets.simplifyWaitcnt(
Wait);
1834 Wait = WCG->getAllZeroWaitcnt(
false);
1836 if (ForceEmitWaitcnt[LOAD_CNT])
1838 if (ForceEmitWaitcnt[EXP_CNT])
1840 if (ForceEmitWaitcnt[DS_CNT])
1842 if (ForceEmitWaitcnt[SAMPLE_CNT])
1844 if (ForceEmitWaitcnt[BVH_CNT])
1846 if (ForceEmitWaitcnt[KM_CNT])
1850 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
1852 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
1854 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
1861 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
1868 WaitcntBrackets &ScoreBrackets,
1872 if (OldWaitcntInstr)
1876 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
1880 ScoreBrackets.applyWaitcnt(
Wait);
1883 if (
Wait.ExpCnt != ~0u && It !=
Block.instr_end() &&
1886 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1894 <<
"Update Instr: " << *It);
1897 if (WCG->createNewWaitcnt(
Block, It,
Wait))
1906bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(
const MachineInstr &
MI)
const {
1914 if (
MI.memoperands_empty())
1923 unsigned AS = Memop->getAddrSpace();
1934bool SIInsertWaitcnts::mayAccessLDSThroughFlat(
const MachineInstr &
MI)
const {
1938 if (!
TII->usesLGKM_CNT(
MI))
1942 if (
ST->isTgSplitEnabled())
1947 if (
MI.memoperands_empty())
1952 unsigned AS = Memop->getAddrSpace();
1962bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
1967 if (
TII->isFLATScratch(
MI))
1971 if (
TII->isFLATGlobal(
MI))
1976 if (
MI.memoperands_empty())
1981 unsigned AS = Memop->getAddrSpace();
1982 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
1988 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
1989 Opc == AMDGPU::GLOBAL_WBINV;
1992void SIInsertWaitcnts::updateEventWaitcntAfter(
MachineInstr &Inst,
1993 WaitcntBrackets *ScoreBrackets) {
1999 if (
TII->isDS(Inst) &&
TII->usesLGKM_CNT(Inst)) {
2001 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2002 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, GDS_ACCESS, Inst);
2003 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, GDS_GPR_LOCK, Inst);
2005 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, LDS_ACCESS, Inst);
2007 }
else if (
TII->isFLAT(Inst)) {
2014 int FlatASCount = 0;
2016 if (mayAccessVMEMThroughFlat(Inst)) {
2018 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, getVmemWaitEventType(Inst),
2022 if (mayAccessLDSThroughFlat(Inst)) {
2024 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, LDS_ACCESS, Inst);
2033 if (FlatASCount > 1)
2034 ScoreBrackets->setPendingFlat();
2037 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, getVmemWaitEventType(Inst),
2040 if (
ST->vmemWriteNeedsExpWaitcnt() &&
2042 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, VMW_GPR_LOCK, Inst);
2044 }
else if (
TII->isSMRD(Inst)) {
2045 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SMEM_ACCESS, Inst);
2046 }
else if (Inst.
isCall()) {
2049 ScoreBrackets->applyWaitcnt(
2050 WCG->getAllZeroWaitcnt(
false));
2051 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2057 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_LDS_ACCESS, Inst);
2058 }
else if (
TII->isVINTERP(Inst)) {
2059 int64_t
Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2060 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2062 unsigned Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2064 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_PARAM_ACCESS, Inst);
2066 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_POS_ACCESS, Inst);
2068 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_GPR_LOCK, Inst);
2071 case AMDGPU::S_SENDMSG:
2072 case AMDGPU::S_SENDMSG_RTN_B32:
2073 case AMDGPU::S_SENDMSG_RTN_B64:
2074 case AMDGPU::S_SENDMSGHALT:
2075 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SQ_MESSAGE, Inst);
2077 case AMDGPU::S_MEMTIME:
2078 case AMDGPU::S_MEMREALTIME:
2079 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2080 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2081 case AMDGPU::S_BARRIER_LEAVE:
2082 case AMDGPU::S_GET_BARRIER_STATE_M0:
2083 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2084 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SMEM_ACCESS, Inst);
2090bool WaitcntBrackets::mergeScore(
const MergeInfo &M,
unsigned &Score,
2091 unsigned OtherScore) {
2092 unsigned MyShifted = Score <=
M.OldLB ? 0 : Score +
M.MyShift;
2093 unsigned OtherShifted =
2094 OtherScore <=
M.OtherLB ? 0 : OtherScore +
M.OtherShift;
2095 Score = std::max(MyShifted, OtherShifted);
2096 return OtherShifted > MyShifted;
2104bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
2105 bool StrictDom =
false;
2107 VgprUB = std::max(VgprUB,
Other.VgprUB);
2108 SgprUB = std::max(SgprUB,
Other.SgprUB);
2110 for (
auto T : inst_counter_types(MaxCounter)) {
2112 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[
T];
2113 const unsigned OtherEvents =
Other.PendingEvents & WaitEventMaskForInst[
T];
2114 if (OtherEvents & ~OldEvents)
2116 PendingEvents |= OtherEvents;
2119 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
2120 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
2121 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
2122 if (NewUB < ScoreLBs[
T])
2126 M.OldLB = ScoreLBs[
T];
2127 M.OtherLB =
Other.ScoreLBs[
T];
2128 M.MyShift = NewUB - ScoreUBs[
T];
2129 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
2131 ScoreUBs[
T] = NewUB;
2133 StrictDom |= mergeScore(M, LastFlat[
T],
Other.LastFlat[
T]);
2135 for (
int J = 0; J <= VgprUB; J++)
2136 StrictDom |= mergeScore(M, VgprScores[
T][J],
Other.VgprScores[
T][J]);
2138 if (
T == SmemAccessCounter) {
2139 for (
int J = 0; J <= SgprUB; J++)
2140 StrictDom |= mergeScore(M, SgprScores[J],
Other.SgprScores[J]);
2144 for (
int J = 0; J <= VgprUB; J++) {
2145 unsigned char NewVmemTypes = VgprVmemTypes[J] |
Other.VgprVmemTypes[J];
2146 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2147 VgprVmemTypes[J] = NewVmemTypes;
2155 return Opcode == AMDGPU::S_WAITCNT ||
2158 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2159 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2166 WaitcntBrackets &ScoreBrackets) {
2170 dbgs() <<
"*** Block" <<
Block.getNumber() <<
" ***";
2171 ScoreBrackets.dump();
2177 bool VCCZCorrect =
true;
2178 if (
ST->hasReadVCCZBug()) {
2181 VCCZCorrect =
false;
2182 }
else if (!
ST->partialVCCWritesUpdateVCCZ()) {
2185 VCCZCorrect =
false;
2192 E =
Block.instr_end();
2199 if (!OldWaitcntInstr)
2200 OldWaitcntInstr = &Inst;
2205 bool FlushVmCnt =
Block.getFirstTerminator() == Inst &&
2206 isPreheaderToFlush(
Block, ScoreBrackets);
2209 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2211 OldWaitcntInstr =
nullptr;
2214 bool RestoreVCCZ = !VCCZCorrect &&
readsVCCZ(Inst);
2217 if (
ST->hasReadVCCZBug() || !
ST->partialVCCWritesUpdateVCCZ()) {
2221 if (!
ST->partialVCCWritesUpdateVCCZ())
2222 VCCZCorrect =
false;
2231 if (
ST->hasReadVCCZBug() &&
2232 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2235 VCCZCorrect =
false;
2243 if (
TII->isSMRD(Inst)) {
2247 if (!Memop->isInvariant()) {
2248 const Value *
Ptr = Memop->getValue();
2252 if (
ST->hasReadVCCZBug()) {
2254 VCCZCorrect =
false;
2258 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2263 ScoreBrackets.simplifyWaitcnt(
Wait);
2265 ScoreBrackets,
nullptr);
2270 ScoreBrackets.dump();
2280 TII->get(
ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2293 if (
Block.getFirstTerminator() ==
Block.end() &&
2294 isPreheaderToFlush(
Block, ScoreBrackets)) {
2295 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2297 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2299 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2313 WaitcntBrackets &ScoreBrackets) {
2314 auto [Iterator, IsInserted] = PreheadersToFlush.
try_emplace(&
MBB,
false);
2316 return Iterator->second;
2327 shouldFlushVmCnt(
Loop, ScoreBrackets)) {
2328 Iterator->second =
true;
2335bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
2349 WaitcntBrackets &Brackets) {
2350 bool HasVMemLoad =
false;
2351 bool HasVMemStore =
false;
2352 bool UsesVgprLoadedOutside =
false;
2358 if (isVMEMOrFlatVMEM(
MI)) {
2362 HasVMemStore =
true;
2365 if (!
TRI->isVectorRegister(*
MRI,
Op.getReg()))
2377 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2378 Brackets.getScoreLB(LOAD_CNT) ||
2379 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2380 Brackets.getScoreLB(SAMPLE_CNT) ||
2381 Brackets.getRegScore(RegNo, BVH_CNT) >
2382 Brackets.getScoreLB(BVH_CNT)) {
2383 UsesVgprLoadedOutside =
true;
2390 if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad()) {
2404 if (!
ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2406 return HasVMemLoad && UsesVgprLoadedOutside &&
ST->hasVmemWriteVgprInOrder();
2411 TII =
ST->getInstrInfo();
2412 TRI = &
TII->getRegisterInfo();
2415 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2416 PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2417 if (
auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2418 AA = &AAR->getAAResults();
2422 if (
ST->hasExtendedWaitCounts()) {
2423 MaxCounter = NUM_EXTENDED_INST_CNTS;
2424 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2425 WCG = &WCGGFX12Plus;
2427 MaxCounter = NUM_NORMAL_INST_CNTS;
2428 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2432 for (
auto T : inst_counter_types())
2433 ForceEmitWaitcnt[
T] =
false;
2435 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2437 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2439 HardwareLimits Limits = {};
2440 if (
ST->hasExtendedWaitCounts()) {
2453 unsigned NumVGPRsMax =
ST->getAddressableNumVGPRs();
2454 unsigned NumSGPRsMax =
ST->getAddressableNumSGPRs();
2455 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2456 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2458 RegisterEncoding Encoding = {};
2461 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
2464 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
2480 I != E && (
I->isPHI() ||
I->isMetaInstruction()); ++
I)
2483 if (
ST->hasExtendedWaitCounts()) {
2486 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2487 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
2491 TII->get(instrsForExtendedCounterTypes[CT]))
2498 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2499 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2501 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2502 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2512 std::unique_ptr<WaitcntBrackets> Brackets;
2517 for (
auto BII = BlockInfos.
begin(), BIE = BlockInfos.
end(); BII != BIE;
2520 BlockInfo &BI = BII->second;
2526 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2528 *Brackets = *BI.Incoming;
2531 Brackets = std::make_unique<WaitcntBrackets>(
2532 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2535 *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
2536 WaitEventMaskForInst, SmemAccessCounter);
2539 Modified |= insertWaitcntInBlock(MF, *
MBB, *Brackets);
2542 if (Brackets->hasPendingEvent()) {
2543 BlockInfo *MoveBracketsToSucc =
nullptr;
2545 auto *SuccBII = BlockInfos.
find(Succ);
2546 BlockInfo &SuccBI = SuccBII->second;
2547 if (!SuccBI.Incoming) {
2548 SuccBI.Dirty =
true;
2551 if (!MoveBracketsToSucc) {
2552 MoveBracketsToSucc = &SuccBI;
2554 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2556 }
else if (SuccBI.Incoming->merge(*Brackets)) {
2557 SuccBI.Dirty =
true;
2562 if (MoveBracketsToSucc)
2563 MoveBracketsToSucc->Incoming = std::move(Brackets);
2568 if (
ST->hasScalarStores()) {
2570 bool HaveScalarStores =
false;
2574 if (!HaveScalarStores &&
TII->isScalarStore(
MI))
2575 HaveScalarStores =
true;
2577 if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
2578 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2583 if (HaveScalarStores) {
2593 bool SeenDCacheWB =
false;
2597 if (
I->getOpcode() == AMDGPU::S_DCACHE_WB)
2598 SeenDCacheWB =
true;
2599 else if (
TII->isScalarStore(*
I))
2600 SeenDCacheWB =
false;
2603 if ((
I->getOpcode() == AMDGPU::S_ENDPGM ||
2604 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2618 if (!ReleaseVGPRInsts.
empty() &&
2619 (MF.getFrameInfo().hasCalls() ||
2620 ST->getOccupancyWithNumVGPRs(
2621 TRI->getNumUsedPhysRegs(*
MRI, AMDGPU::VGPR_32RegClass)) <
2624 if (
ST->requiresNopBeforeDeallocVGPRs()) {
2626 TII->get(AMDGPU::S_NOP))
2630 TII->get(AMDGPU::S_SENDMSG))
2635 ReleaseVGPRInsts.clear();
2636 PreheadersToFlush.
clear();
2637 SLoadAddresses.
clear();
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Analysis containing CSE Info
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
std::optional< std::vector< StOtherPiece > > Other
static Function * getFunction(Constant *C)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
unsigned const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
static bool isCacheInvOrWBInst(MachineInstr &Inst)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is waiting on.
static bool readsVCCZ(const MachineInstr &MI)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Provides some synthesis utilities to produce sequences of values.
static const uint32_t IV[8]
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isEntryFunction() const
Represent the analysis usage information of a pass.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
This class represents an Operation in the Expression.
static bool isCounterSet(unsigned ID)
static bool shouldExecute(unsigned CounterName)
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
iterator_range< filtered_mop_iterator > all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
iterator_range< mop_iterator > defs()
Returns a range over all explicit operands that are register definitions.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
A description of a memory reference used in the backend.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
iterator find(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isVIMAGE(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
LLVM Value Representation.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
self_iterator getIterator()
This class implements an extremely fast bulk output stream that can only output to a stream.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
@ ID_DEALLOC_VGPRS_GFX11Plus
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI)
If Reg is a pseudo reg, return the correct hardware register given STI otherwise return Reg.
unsigned getStorecntBitMask(const IsaVersion &Version)
IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
char & SIInsertWaitcntsID
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
CodeGenOptLevel
Code generation optimization level.
FunctionPass * createSIInsertWaitcntsPass()
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
static constexpr bool is_iterable