46#define DEBUG_TYPE "si-insert-waitcnts"
49 "Force emit s_waitcnt expcnt(0) instrs");
51 "Force emit s_waitcnt lgkmcnt(0) instrs");
53 "Force emit s_waitcnt vmcnt(0) instrs");
57 cl::desc(
"Force all waitcnt instrs to be emitted as "
58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc(
"Force all waitcnt load counters to wait until 0"),
77 SAMPLE_CNT = NUM_NORMAL_INST_CNTS,
81 NUM_EXTENDED_INST_CNTS,
82 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
96auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
97 return enum_seq(LOAD_CNT, MaxCounter);
// Register interval expressed as a pair of ints. Elsewhere in this file a
// single register N is passed as {N, N + 1}, so the pair appears to be used
// as a half-open range [first, second).
100using RegInterval = std::pair<int, int>;
102struct HardwareLimits {
106 unsigned StorecntMax;
107 unsigned SamplecntMax;
113#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
115 DECL(VMEM_READ_ACCESS) \
116 DECL(VMEM_SAMPLER_READ_ACCESS) \
117 DECL(VMEM_BVH_READ_ACCESS) \
118 DECL(VMEM_WRITE_ACCESS) \
119 DECL(SCRATCH_WRITE_ACCESS) \
128 DECL(EXP_POS_ACCESS) \
129 DECL(EXP_PARAM_ACCESS) \
134#define AMDGPU_EVENT_ENUM(Name) Name,
139#undef AMDGPU_EVENT_ENUM
141#define AMDGPU_EVENT_NAME(Name) #Name,
145#undef AMDGPU_EVENT_NAME
154enum RegisterMapping {
155 SQ_MAX_PGM_VGPRS = 1024,
157 SQ_MAX_PGM_SGPRS = 128,
163 FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS,
165 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS,
// Table of the dedicated S_WAIT_* opcodes, one entry per extended counter
// type (NUM_EXTENDED_INST_CNTS entries in total).
// NOTE(review): the entry order presumably mirrors the InstCounterType enum
// (LOAD, DS, EXP, STORE, SAMPLE, BVH, KM, X) -- confirm against the enum.
186static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
187 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
188 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
189 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
197static bool isNormalMode(InstCounterType MaxCounter) {
198 return MaxCounter == NUM_NORMAL_INST_CNTS;
203 assert(updateVMCntOnly(Inst));
205 return VMEM_NOSAMPLER;
219 return VMEM_NOSAMPLER;
231 return Wait.StoreCnt;
233 return Wait.SampleCnt;
246 unsigned &WC = getCounterRef(
Wait,
T);
247 WC = std::min(WC, Count);
251 getCounterRef(
Wait,
T) = ~0
u;
255 return getCounterRef(
Wait,
T);
259InstCounterType eventCounter(
const unsigned *masks, WaitEventType E) {
260 for (
auto T : inst_counter_types()) {
261 if (masks[
T] & (1 << E))
267class WaitcntBrackets;
275class WaitcntGenerator {
280 InstCounterType MaxCounter;
284 WaitcntGenerator() =
default;
285 WaitcntGenerator(
const MachineFunction &MF, InstCounterType MaxCounter)
// Returns the OptNone flag held by this generator.
293 bool isOptNone()
const {
return OptNone; }
307 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
// Returns a pointer to the per-counter event mask table. In the overrides
// visible in this file the table has NUM_INST_CNTS entries, each built with
// eventMask() from the wait events that belong to that counter.
322 virtual const unsigned *getWaitEventMask()
const = 0;
326 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const = 0;
328 virtual ~WaitcntGenerator() =
default;
331 static constexpr unsigned
332 eventMask(std::initializer_list<WaitEventType> Events) {
334 for (
auto &E : Events)
341class WaitcntGeneratorPreGFX12 :
public WaitcntGenerator {
343 WaitcntGeneratorPreGFX12() =
default;
345 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
348 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
356 const unsigned *getWaitEventMask()
const override {
359 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
360 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
361 VMEM_BVH_READ_ACCESS}),
362 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
363 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
364 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
365 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
371 return WaitEventMaskForInstPreGFX12;
377class WaitcntGeneratorGFX12Plus :
public WaitcntGenerator {
379 WaitcntGeneratorGFX12Plus() =
default;
381 InstCounterType MaxCounter)
382 : WaitcntGenerator(MF, MaxCounter) {}
385 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
393 const unsigned *getWaitEventMask()
const override {
396 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
397 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
398 eventMask({LDS_ACCESS, GDS_ACCESS}),
399 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
400 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
401 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
402 eventMask({VMEM_SAMPLER_READ_ACCESS}),
403 eventMask({VMEM_BVH_READ_ACCESS}),
404 eventMask({SMEM_ACCESS, SQ_MESSAGE}),
405 eventMask({VMEM_GROUP, SMEM_GROUP})};
407 return WaitEventMaskForInstGFX12Plus;
413class SIInsertWaitcnts {
416 InstCounterType SmemAccessCounter;
417 InstCounterType MaxCounter;
418 const unsigned *WaitEventMaskForInst;
432 std::unique_ptr<WaitcntBrackets>
Incoming;
438 bool ForceEmitWaitcnt[NUM_INST_CNTS];
443 WaitcntGeneratorPreGFX12 WCGPreGFX12;
444 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
446 WaitcntGenerator *WCG =
nullptr;
452 HardwareLimits Limits;
457 : MLI(MLI), PDT(PDT), AA(AA) {
458 (void)ForceExpCounter;
459 (void)ForceLgkmCounter;
460 (void)ForceVMCounter;
463 unsigned getWaitCountMax(InstCounterType
T)
const {
466 return Limits.LoadcntMax;
468 return Limits.DscntMax;
470 return Limits.ExpcntMax;
472 return Limits.StorecntMax;
474 return Limits.SamplecntMax;
476 return Limits.BvhcntMax;
478 return Limits.KmcntMax;
480 return Limits.XcntMax;
487 bool shouldFlushVmCnt(
MachineLoop *
ML,
const WaitcntBrackets &Brackets);
489 const WaitcntBrackets &ScoreBrackets);
493 bool isForceEmitWaitcnt()
const {
494 for (
auto T : inst_counter_types())
495 if (ForceEmitWaitcnt[
T])
500 void setForceEmitWaitcnt() {
506 ForceEmitWaitcnt[
EXP_CNT] =
true;
508 ForceEmitWaitcnt[
EXP_CNT] =
false;
513 ForceEmitWaitcnt[DS_CNT] =
true;
514 ForceEmitWaitcnt[KM_CNT] =
true;
516 ForceEmitWaitcnt[DS_CNT] =
false;
517 ForceEmitWaitcnt[KM_CNT] =
false;
522 ForceEmitWaitcnt[LOAD_CNT] =
true;
523 ForceEmitWaitcnt[SAMPLE_CNT] =
true;
524 ForceEmitWaitcnt[BVH_CNT] =
true;
526 ForceEmitWaitcnt[LOAD_CNT] =
false;
527 ForceEmitWaitcnt[SAMPLE_CNT] =
false;
528 ForceEmitWaitcnt[BVH_CNT] =
false;
535 WaitEventType getVmemWaitEventType(
const MachineInstr &Inst)
const {
537 case AMDGPU::GLOBAL_INV:
538 return VMEM_READ_ACCESS;
539 case AMDGPU::GLOBAL_WB:
540 case AMDGPU::GLOBAL_WBINV:
541 return VMEM_WRITE_ACCESS;
547 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
548 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
559 if (
TII->mayAccessScratchThroughFlat(Inst))
560 return SCRATCH_WRITE_ACCESS;
561 return VMEM_WRITE_ACCESS;
564 return VMEM_READ_ACCESS;
565 return VmemReadMapping[getVmemType(Inst)];
// Whether the subtarget reports X counter support; simply forwards to
// ST->hasWaitXCnt().
568 bool hasXcnt()
const {
return ST->hasWaitXCnt(); }
574 WaitcntBrackets &ScoreBrackets,
582 WaitcntBrackets *ScoreBrackets);
586 WaitcntBrackets &ScoreBrackets);
588 WaitcntBrackets &ScoreBrackets);
599class WaitcntBrackets {
// Binds the brackets to the given SIInsertWaitcnts instance. Only the pointer
// is stored here; the body is empty.
601 WaitcntBrackets(
const SIInsertWaitcnts *Context) :
Context(
Context) {}
603 bool isSmemCounter(InstCounterType
T)
const {
604 return T ==
Context->SmemAccessCounter ||
T == X_CNT;
607 unsigned getSgprScoresIdx(InstCounterType
T)
const {
608 assert(isSmemCounter(
T) &&
"Invalid SMEM counter");
609 return T == X_CNT ? 1 : 0;
612 unsigned getScoreLB(InstCounterType
T)
const {
617 unsigned getScoreUB(InstCounterType
T)
const {
622 unsigned getScoreRange(InstCounterType
T)
const {
623 return getScoreUB(
T) - getScoreLB(
T);
626 unsigned getRegScore(
int GprNo, InstCounterType
T)
const {
627 if (GprNo < NUM_ALL_VGPRS)
628 return VgprScores[
T][GprNo];
629 return SgprScores[getSgprScoresIdx(
T)][GprNo - NUM_ALL_VGPRS];
639 bool counterOutOfOrder(InstCounterType
T)
const;
641 void simplifyWaitcnt(InstCounterType
T,
unsigned &Count)
const;
643 void determineWait(InstCounterType
T, RegInterval
Interval,
645 void determineWait(InstCounterType
T,
int RegNo,
647 determineWait(
T, {RegNo, RegNo + 1},
Wait);
651 void applyWaitcnt(InstCounterType
T,
unsigned Count);
// Returns the raw PendingEvents bitmask (one bit per WaitEventType, set via
// `1 << E`); the result is non-zero when any event is pending.
657 unsigned hasPendingEvent()
const {
return PendingEvents; }
658 unsigned hasPendingEvent(WaitEventType E)
const {
659 return PendingEvents & (1 << E);
661 unsigned hasPendingEvent(InstCounterType
T)
const {
662 unsigned HasPending = PendingEvents &
Context->WaitEventMaskForInst[
T];
663 assert((HasPending != 0) == (getScoreRange(
T) != 0));
667 bool hasMixedPendingEvents(InstCounterType
T)
const {
668 unsigned Events = hasPendingEvent(
T);
670 return Events & (Events - 1);
673 bool hasPendingFlat()
const {
674 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
675 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
676 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
677 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
680 void setPendingFlat() {
681 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
682 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
685 bool hasPendingGDS()
const {
686 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
689 unsigned getPendingGDSWait()
const {
690 return std::min(getScoreUB(DS_CNT) - LastGDS,
691 Context->getWaitCountMax(DS_CNT) - 1);
// Remembers the current DS_CNT upper-bound score in LastGDS; hasPendingGDS()
// later compares LastGDS against the DS_CNT score bounds.
694 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
698 bool hasOtherPendingVmemTypes(RegInterval
Interval, VmemType V)
const {
700 assert(RegNo < NUM_ALL_VGPRS);
701 if (VgprVmemTypes[RegNo] & ~(1 << V))
707 void clearVgprVmemTypes(RegInterval
Interval) {
709 assert(RegNo < NUM_ALL_VGPRS);
710 VgprVmemTypes[RegNo] = 0;
714 void setStateOnFunctionEntryOrReturn() {
715 setScoreUB(STORE_CNT,
716 getScoreUB(STORE_CNT) +
Context->getWaitCountMax(STORE_CNT));
717 PendingEvents |=
Context->WaitEventMaskForInst[STORE_CNT];
738 static bool mergeScore(
const MergeInfo &M,
unsigned &Score,
739 unsigned OtherScore);
741 void setScoreLB(InstCounterType
T,
unsigned Val) {
746 void setScoreUB(InstCounterType
T,
unsigned Val) {
753 if (getScoreRange(EXP_CNT) >
Context->getWaitCountMax(EXP_CNT))
757 void setRegScore(
int GprNo, InstCounterType
T,
unsigned Val) {
758 setScoreByInterval({GprNo, GprNo + 1},
T, Val);
761 void setScoreByInterval(RegInterval
Interval, InstCounterType CntTy,
769 const SIInsertWaitcnts *
Context;
// Per-counter lower and upper score bounds. hasPendingFlat()/hasPendingGDS()
// treat a recorded score as outstanding when LB < score <= UB.
771 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
772 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
// Bitmask of pending wait events; bit (1 << E) corresponds to event E.
773 unsigned PendingEvents = 0;
// Scores recorded by setPendingFlat(); only the LOAD_CNT and DS_CNT slots are
// written in the code visible here.
775 unsigned LastFlat[NUM_INST_CNTS] = {0};
// DS_CNT score recorded by setPendingGDS().
777 unsigned LastGDS = 0;
// Scores indexed as [counter][slot], where slots cover NUM_ALL_VGPRS entries
// (SQ_MAX_PGM_VGPRS plus the NUM_LDS_VGPRS slots starting at FIRST_LDS_VGPR).
782 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
// SGPR scores; getSgprScoresIdx() selects row 1 for X_CNT and row 0 for the
// SMEM access counter.
787 unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
// Per-slot bitmask of VMEM types (bit 1 << VmemType) with pending accesses.
790 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
804 return "SI insert wait instructions";
819RegInterval WaitcntBrackets::getRegInterval(
const MachineInstr *
MI,
823 if (!
TRI->isInAllocatableClass(
Op.getReg()))
833 unsigned RegIdx =
TRI->getHWRegIndex(MCReg);
834 assert(isUInt<8>(RegIdx));
837 unsigned Size =
TRI->getRegSizeInBits(*RC);
840 if (
TRI->isVectorRegister(*
MRI,
Op.getReg())) {
842 assert(Reg < AGPR_OFFSET);
845 Result.first += AGPR_OFFSET;
849 }
else if (
TRI->isSGPRReg(*
MRI,
Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
852 Result.first = RegIdx + NUM_ALL_VGPRS;
861void WaitcntBrackets::setScoreByInterval(RegInterval
Interval,
862 InstCounterType CntTy,
865 if (RegNo < NUM_ALL_VGPRS) {
866 VgprUB = std::max(VgprUB, RegNo);
867 VgprScores[CntTy][RegNo] = Score;
869 SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
870 SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
879 InstCounterType CntTy,
unsigned Score) {
881 setScoreByInterval(
Interval, CntTy, Score);
889bool WaitcntBrackets::hasPointSampleAccel(
const MachineInstr &
MI)
const {
904bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
906 if (!hasPointSampleAccel(
MI))
909 return hasOtherPendingVmemTypes(
Interval, VMEM_NOSAMPLER);
916 InstCounterType
T = eventCounter(
Context->WaitEventMaskForInst, E);
918 unsigned UB = getScoreUB(
T);
919 unsigned CurrScore = UB + 1;
925 PendingEvents |= 1 << E;
926 setScoreUB(
T, CurrScore);
934 if (
const auto *AddrOp =
TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
935 setScoreByOperand(&Inst,
TRI,
MRI, *AddrOp, EXP_CNT, CurrScore);
938 if (
const auto *Data0 =
939 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
940 setScoreByOperand(&Inst,
TRI,
MRI, *Data0, EXP_CNT, CurrScore);
941 if (
const auto *Data1 =
942 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
943 setScoreByOperand(&Inst,
TRI,
MRI, *Data1, EXP_CNT, CurrScore);
946 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
947 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
949 if (
TRI->isVectorRegister(*
MRI,
Op.getReg()))
950 setScoreByOperand(&Inst,
TRI,
MRI,
Op, EXP_CNT, CurrScore);
953 }
else if (
TII->isFLAT(Inst)) {
955 setScoreByOperand(&Inst,
TRI,
MRI,
956 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
959 setScoreByOperand(&Inst,
TRI,
MRI,
960 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
963 }
else if (
TII->isMIMG(Inst)) {
968 setScoreByOperand(&Inst,
TRI,
MRI,
969 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
972 }
else if (
TII->isMTBUF(Inst)) {
976 }
else if (
TII->isMUBUF(Inst)) {
981 setScoreByOperand(&Inst,
TRI,
MRI,
982 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
985 }
else if (
TII->isLDSDIR(Inst)) {
987 setScoreByOperand(&Inst,
TRI,
MRI,
988 *
TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
991 if (
TII->isEXP(Inst)) {
997 if (
TRI->isVGPR(*
MRI, DefMO.getReg())) {
998 setScoreByOperand(&Inst,
TRI,
MRI, DefMO, EXP_CNT, CurrScore);
1003 if (
TRI->isVectorRegister(*
MRI,
Op.getReg()))
1004 setScoreByOperand(&Inst,
TRI,
MRI,
Op, EXP_CNT, CurrScore);
1007 }
else if (
T == X_CNT) {
1009 setScoreByOperand(&Inst,
TRI,
MRI,
Op,
T, CurrScore);
1022 if (
T == LOAD_CNT ||
T == SAMPLE_CNT ||
T == BVH_CNT) {
1023 if (
Interval.first >= NUM_ALL_VGPRS)
1025 if (updateVMCntOnly(Inst)) {
1030 VmemType
V = getVmemType(Inst);
1031 unsigned char TypesMask = 1 <<
V;
1034 if (hasPointSampleAccel(Inst))
1035 TypesMask |= 1 << VMEM_NOSAMPLER;
1037 VgprVmemTypes[RegNo] |= TypesMask;
1040 setScoreByInterval(
Interval,
T, CurrScore);
1043 (
TII->isDS(Inst) ||
TII->mayWriteLDSThroughDMA(Inst))) {
1048 if (!
MemOp->isStore() ||
1053 auto AAI =
MemOp->getAAInfo();
1061 if (!AAI || !AAI.Scope)
1063 for (
unsigned I = 0, E = LDSDMAStores.size();
I != E && !Slot; ++
I) {
1064 for (
const auto *
MemOp : LDSDMAStores[
I]->memoperands()) {
1065 if (
MemOp->isStore() && AAI ==
MemOp->getAAInfo()) {
1071 if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
1073 LDSDMAStores.push_back(&Inst);
1074 Slot = LDSDMAStores.size();
1077 setRegScore(FIRST_LDS_VGPR + Slot,
T, CurrScore);
1079 setRegScore(FIRST_LDS_VGPR,
T, CurrScore);
1088 for (
auto T : inst_counter_types(
Context->MaxCounter)) {
1089 unsigned SR = getScoreRange(
T);
1093 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
1097 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
1101 OS <<
" EXP_CNT(" << SR <<
"): ";
1104 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
1108 OS <<
" SAMPLE_CNT(" << SR <<
"): ";
1111 OS <<
" BVH_CNT(" << SR <<
"): ";
1114 OS <<
" KM_CNT(" << SR <<
"): ";
1117 OS <<
" X_CNT(" << SR <<
"): ";
1120 OS <<
" UNKNOWN(" << SR <<
"): ";
1126 unsigned LB = getScoreLB(
T);
1128 for (
int J = 0; J <= VgprUB; J++) {
1129 unsigned RegScore = getRegScore(J,
T);
1132 unsigned RelScore = RegScore - LB - 1;
1133 if (J < FIRST_LDS_VGPR) {
1134 OS << RelScore <<
":v" << J <<
" ";
1136 OS << RelScore <<
":ds ";
1140 if (isSmemCounter(
T)) {
1141 for (
int J = 0; J <= SgprUB; J++) {
1142 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS,
T);
1145 unsigned RelScore = RegScore - LB - 1;
1146 OS << RelScore <<
":s" << J <<
" ";
1153 OS <<
"Pending Events: ";
1154 if (hasPendingEvent()) {
1156 for (
unsigned I = 0;
I != NUM_WAIT_EVENTS; ++
I) {
1157 if (hasPendingEvent((WaitEventType)
I)) {
1158 OS <<
LS << WaitEventTypeName[
I];
1172 simplifyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1173 simplifyWaitcnt(EXP_CNT,
Wait.ExpCnt);
1174 simplifyWaitcnt(DS_CNT,
Wait.DsCnt);
1175 simplifyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1176 simplifyWaitcnt(SAMPLE_CNT,
Wait.SampleCnt);
1177 simplifyWaitcnt(BVH_CNT,
Wait.BvhCnt);
1178 simplifyWaitcnt(KM_CNT,
Wait.KmCnt);
1179 simplifyWaitcnt(X_CNT,
Wait.XCnt);
1182void WaitcntBrackets::simplifyWaitcnt(InstCounterType
T,
1183 unsigned &Count)
const {
1187 if (Count >= getScoreRange(
T))
1191void WaitcntBrackets::determineWait(InstCounterType
T, RegInterval
Interval,
1193 const unsigned LB = getScoreLB(
T);
1194 const unsigned UB = getScoreUB(
T);
1196 unsigned ScoreToWait = getRegScore(RegNo,
T);
1200 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1201 if ((
T == LOAD_CNT ||
T == DS_CNT) && hasPendingFlat() &&
1202 !
Context->ST->hasFlatLgkmVMemCountInOrder()) {
1206 addWait(
Wait,
T, 0);
1207 }
else if (counterOutOfOrder(
T)) {
1211 addWait(
Wait,
T, 0);
1215 unsigned NeededWait =
1216 std::min(UB - ScoreToWait,
Context->getWaitCountMax(
T) - 1);
1217 addWait(
Wait,
T, NeededWait);
1224 applyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1225 applyWaitcnt(EXP_CNT,
Wait.ExpCnt);
1226 applyWaitcnt(DS_CNT,
Wait.DsCnt);
1227 applyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1228 applyWaitcnt(SAMPLE_CNT,
Wait.SampleCnt);
1229 applyWaitcnt(BVH_CNT,
Wait.BvhCnt);
1230 applyWaitcnt(KM_CNT,
Wait.KmCnt);
1234void WaitcntBrackets::applyWaitcnt(InstCounterType
T,
unsigned Count) {
1235 const unsigned UB = getScoreUB(
T);
1239 if (counterOutOfOrder(
T))
1241 setScoreLB(
T, std::max(getScoreLB(
T), UB - Count));
1244 PendingEvents &= ~Context->WaitEventMaskForInst[
T];
1252 if (
Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
1253 return applyWaitcnt(X_CNT, 0);
1258 if (
Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1259 !hasPendingEvent(STORE_CNT))
1260 return applyWaitcnt(X_CNT, std::min(
Wait.XCnt,
Wait.LoadCnt));
1262 applyWaitcnt(X_CNT,
Wait.XCnt);
1267bool WaitcntBrackets::counterOutOfOrder(InstCounterType
T)
const {
1269 if ((
T ==
Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1270 (
T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1272 return hasMixedPendingEvents(
T);
// Out-of-line definition of the static ID member, initialized to 0.
// NOTE(review): presumably the legacy pass manager identifies the pass by
// this member's address rather than its value -- confirm.
1282char SIInsertWaitcntsLegacy::
ID = 0;
1287 return new SIInsertWaitcntsLegacy();
1292 int OpIdx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
OpName);
1297 if (NewEnc == MO.
getImm())
1308 case AMDGPU::S_WAIT_LOADCNT:
1310 case AMDGPU::S_WAIT_EXPCNT:
1312 case AMDGPU::S_WAIT_STORECNT:
1314 case AMDGPU::S_WAIT_SAMPLECNT:
1316 case AMDGPU::S_WAIT_BVHCNT:
1318 case AMDGPU::S_WAIT_DSCNT:
1320 case AMDGPU::S_WAIT_KMCNT:
1322 case AMDGPU::S_WAIT_XCNT:
1329bool WaitcntGenerator::promoteSoftWaitCnt(
MachineInstr *Waitcnt)
const {
1343bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1344 WaitcntBrackets &ScoreBrackets,
MachineInstr &OldWaitcntInstr,
1347 assert(isNormalMode(MaxCounter));
1354 dbgs() <<
"PreGFX12::applyPreexistingWaitcnt at: ";
1356 dbgs() <<
"end of block\n";
1364 if (
II.isMetaInstruction()) {
1370 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1374 if (Opcode == AMDGPU::S_WAITCNT) {
1375 unsigned IEnc =
II.getOperand(0).getImm();
1378 ScoreBrackets.simplifyWaitcnt(OldWait);
1382 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1383 II.eraseFromParent();
1387 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1390 <<
"Before: " <<
Wait.LoadCnt <<
'\n';);
1391 ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR,
Wait);
1400 II.eraseFromParent();
1402 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1403 assert(
II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1406 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1408 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1409 Wait.StoreCnt = std::min(
Wait.StoreCnt, OldVSCnt);
1411 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && TrySimplify)) {
1412 II.eraseFromParent();
1415 WaitcntVsCntInstr = &
II;
1422 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1424 ScoreBrackets.applyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1425 ScoreBrackets.applyWaitcnt(EXP_CNT,
Wait.ExpCnt);
1426 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1433 <<
"applied pre-existing waitcnt\n"
1434 <<
"New Instr at block end: " << *WaitcntInstr <<
'\n'
1435 :
dbgs() <<
"applied pre-existing waitcnt\n"
1436 <<
"Old Instr: " << *It
1437 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1440 if (WaitcntVsCntInstr) {
1442 AMDGPU::OpName::simm16,
Wait.StoreCnt);
1443 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1445 ScoreBrackets.applyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1446 Wait.StoreCnt = ~0
u;
1449 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1450 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1452 :
dbgs() <<
"applied pre-existing waitcnt\n"
1453 <<
"Old Instr: " << *It
1454 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
1462bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1466 assert(isNormalMode(MaxCounter));
1473 if (
Wait.hasWaitExceptStoreCnt()) {
1475 [[maybe_unused]]
auto SWaitInst =
1480 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1481 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1484 if (
Wait.hasWaitStoreCnt()) {
1487 [[maybe_unused]]
auto SWaitInst =
1494 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1495 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1502WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1507WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1516bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1517 WaitcntBrackets &ScoreBrackets,
MachineInstr &OldWaitcntInstr,
1520 assert(!isNormalMode(MaxCounter));
1528 dbgs() <<
"GFX12Plus::applyPreexistingWaitcnt at: ";
1530 dbgs() <<
"end of block\n";
1538 if (
II.isMetaInstruction()) {
1549 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1553 if (Opcode == AMDGPU::S_WAITCNT)
1556 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1558 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1561 ScoreBrackets.simplifyWaitcnt(OldWait);
1563 UpdatableInstr = &CombinedLoadDsCntInstr;
1564 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1566 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1569 ScoreBrackets.simplifyWaitcnt(OldWait);
1571 UpdatableInstr = &CombinedStoreDsCntInstr;
1572 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1575 II.eraseFromParent();
1581 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1583 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1584 addWait(
Wait, CT.value(), OldCnt);
1585 UpdatableInstr = &WaitInstrs[CT.value()];
1589 if (!*UpdatableInstr) {
1590 *UpdatableInstr = &
II;
1592 II.eraseFromParent();
1597 if (CombinedLoadDsCntInstr) {
1605 if (
Wait.LoadCnt != ~0u &&
Wait.DsCnt != ~0u) {
1608 AMDGPU::OpName::simm16, NewEnc);
1609 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1610 ScoreBrackets.applyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1611 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1616 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1617 <<
"New Instr at block end: "
1618 << *CombinedLoadDsCntInstr <<
'\n'
1619 :
dbgs() <<
"applied pre-existing waitcnt\n"
1620 <<
"Old Instr: " << *It <<
"New Instr: "
1621 << *CombinedLoadDsCntInstr <<
'\n');
1628 if (CombinedStoreDsCntInstr) {
1630 if (
Wait.StoreCnt != ~0u &&
Wait.DsCnt != ~0u) {
1633 AMDGPU::OpName::simm16, NewEnc);
1634 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1635 ScoreBrackets.applyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1636 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1637 Wait.StoreCnt = ~0
u;
1641 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1642 <<
"New Instr at block end: "
1643 << *CombinedStoreDsCntInstr <<
'\n'
1644 :
dbgs() <<
"applied pre-existing waitcnt\n"
1645 <<
"Old Instr: " << *It <<
"New Instr: "
1646 << *CombinedStoreDsCntInstr <<
'\n');
1659 if (
Wait.DsCnt != ~0u) {
1668 if (
Wait.LoadCnt != ~0u) {
1669 WaitsToErase.
push_back(&WaitInstrs[LOAD_CNT]);
1670 WaitsToErase.
push_back(&WaitInstrs[DS_CNT]);
1671 }
else if (
Wait.StoreCnt != ~0u) {
1672 WaitsToErase.
push_back(&WaitInstrs[STORE_CNT]);
1673 WaitsToErase.
push_back(&WaitInstrs[DS_CNT]);
1680 (*WI)->eraseFromParent();
1686 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1687 if (!WaitInstrs[CT])
1690 unsigned NewCnt = getWait(
Wait, CT);
1691 if (NewCnt != ~0u) {
1693 AMDGPU::OpName::simm16, NewCnt);
1694 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1696 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1697 setNoWait(
Wait, CT);
1700 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1701 <<
"New Instr at block end: " << *WaitInstrs[CT]
1703 :
dbgs() <<
"applied pre-existing waitcnt\n"
1704 <<
"Old Instr: " << *It
1705 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
1716bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1720 assert(!isNormalMode(MaxCounter));
1726 if (
Wait.DsCnt != ~0u) {
1729 if (
Wait.LoadCnt != ~0u) {
1737 }
else if (
Wait.StoreCnt != ~0u) {
1744 Wait.StoreCnt = ~0
u;
1752 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1753 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1760 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1761 unsigned Count = getWait(
Wait, CT);
1765 [[maybe_unused]]
auto SWaitInst =
1772 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1773 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1780 unsigned Opc =
MI.getOpcode();
1781 return (
Opc == AMDGPU::S_CBRANCH_VCCNZ ||
Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1782 !
MI.getOperand(1).isUndef();
1810bool SIInsertWaitcnts::generateWaitcntInstBefore(
MachineInstr &
MI,
1811 WaitcntBrackets &ScoreBrackets,
1814 setForceEmitWaitcnt();
1824 if (
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1825 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1826 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1827 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1828 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1835 if (
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1836 MI.getOpcode() == AMDGPU::SI_RETURN ||
1837 MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
1838 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1840 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
false));
1850 else if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
1851 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1852 if (!WCG->isOptNone() &&
1855 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1856 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
1860 else if ((
MI.getOpcode() == AMDGPU::S_SENDMSG ||
1861 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1862 ST->hasLegacyGeometry() &&
1873 if (
MI.modifiesRegister(AMDGPU::EXEC,
TRI)) {
1876 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1877 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1878 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1879 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1886 if (
TII->isAlwaysGDS(
MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
1887 addWait(
Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
1895 const auto &CallAddrOp = *
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
1896 if (CallAddrOp.isReg()) {
1897 RegInterval CallAddrOpInterval =
1898 ScoreBrackets.getRegInterval(&
MI,
MRI,
TRI, CallAddrOp);
1900 ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
1903 if (
const auto *RtnAddrOp =
1904 TII->getNamedOperand(
MI, AMDGPU::OpName::dst)) {
1905 RegInterval RtnAddrOpInterval =
1906 ScoreBrackets.getRegInterval(&
MI,
MRI,
TRI, *RtnAddrOp);
1908 ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
1928 const Value *
Ptr = Memop->getValue();
1929 if (Memop->isStore()) {
1930 if (
auto It = SLoadAddresses.
find(
Ptr); It != SLoadAddresses.
end()) {
1931 addWait(
Wait, SmemAccessCounter, 0);
1933 SLoadAddresses.
erase(It);
1936 unsigned AS = Memop->getAddrSpace();
1940 if (
TII->mayWriteLDSThroughDMA(
MI))
1944 unsigned RegNo = FIRST_LDS_VGPR;
1951 if (
Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1952 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1953 for (
unsigned I = 0, E = LDSDMAStores.size();
I != E; ++
I) {
1954 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true))
1955 ScoreBrackets.determineWait(LOAD_CNT, RegNo +
I + 1,
Wait);
1958 ScoreBrackets.determineWait(LOAD_CNT, RegNo,
Wait);
1960 if (Memop->isStore()) {
1961 ScoreBrackets.determineWait(EXP_CNT, RegNo,
Wait);
1971 if (
Op.isTied() &&
Op.isUse() &&
TII->doesNotReadTiedSource(
MI))
1976 const bool IsVGPR =
TRI->isVectorRegister(*
MRI,
Op.getReg());
1983 if (
Op.isImplicit() &&
MI.mayLoadOrStore())
1992 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
1993 ScoreBrackets.hasOtherPendingVmemTypes(
Interval,
1995 ScoreBrackets.hasPointSamplePendingVmemTypes(
MI,
Interval) ||
1996 !
ST->hasVmemWriteVgprInOrder()) {
1998 ScoreBrackets.determineWait(SAMPLE_CNT,
Interval,
Wait);
2000 ScoreBrackets.clearVgprVmemTypes(
Interval);
2003 if (
Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2008 ScoreBrackets.determineWait(SmemAccessCounter,
Interval,
Wait);
2011 if (hasXcnt() &&
Op.isDef())
2029 if (
MI.getOpcode() == AMDGPU::S_BARRIER &&
2030 !
ST->hasAutoWaitcntBeforeBarrier() && !
ST->supportsBackOffBarrier()) {
2031 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
2038 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2044 ScoreBrackets.simplifyWaitcnt(
Wait);
2049 Wait = WCG->getAllZeroWaitcnt(
false);
2051 if (ForceEmitWaitcnt[LOAD_CNT])
2053 if (ForceEmitWaitcnt[EXP_CNT])
2055 if (ForceEmitWaitcnt[DS_CNT])
2057 if (ForceEmitWaitcnt[SAMPLE_CNT])
2059 if (ForceEmitWaitcnt[BVH_CNT])
2061 if (ForceEmitWaitcnt[KM_CNT])
2063 if (ForceEmitWaitcnt[X_CNT])
2067 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2069 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2071 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2078 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
2085 WaitcntBrackets &ScoreBrackets,
2089 if (OldWaitcntInstr)
2093 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
2097 ScoreBrackets.applyWaitcnt(
Wait);
2100 if (
Wait.ExpCnt != ~0u && It !=
Block.instr_end() &&
2103 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2111 <<
"Update Instr: " << *It);
2115 if (
Wait.KmCnt == 0 &&
Wait.XCnt != ~0u &&
2116 !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
2119 if (
Wait.LoadCnt == 0 &&
Wait.XCnt != ~0u &&
2120 !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
2126 if (
Wait.XCnt != ~0u && isVmemAccess(*It))
2129 if (WCG->createNewWaitcnt(
Block, It,
Wait))
2138bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(
const MachineInstr &
MI)
const {
2142 if (!
TII->usesVM_CNT(
MI))
2147 if (
MI.memoperands_empty())
2156 unsigned AS = Memop->getAddrSpace();
2167bool SIInsertWaitcnts::mayAccessLDSThroughFlat(
const MachineInstr &
MI)
const {
2171 if (!
TII->usesLGKM_CNT(
MI))
2175 if (
ST->isTgSplitEnabled())
2180 if (
MI.memoperands_empty())
2185 unsigned AS = Memop->getAddrSpace();
2193bool SIInsertWaitcnts::isVmemAccess(
const MachineInstr &
MI)
const {
2194 return (
TII->isFLAT(
MI) && mayAccessVMEMThroughFlat(
MI)) ||
2200 return Opc == AMDGPU::GLOBAL_INV ||
Opc == AMDGPU::GLOBAL_WB ||
2201 Opc == AMDGPU::GLOBAL_WBINV;
2208 auto BlockEnd =
Block->getParent()->end();
2209 auto BlockIter =
Block->getIterator();
2213 if (++BlockIter != BlockEnd) {
2214 It = BlockIter->instr_begin();
2221 if (!It->isMetaInstruction())
2229 return It->getOpcode() == AMDGPU::S_ENDPGM;
2233bool SIInsertWaitcnts::insertForcedWaitAfter(
MachineInstr &Inst,
2235 WaitcntBrackets &ScoreBrackets) {
2237 bool NeedsEndPGMCheck =
false;
2245 NeedsEndPGMCheck =
true;
2248 ScoreBrackets.simplifyWaitcnt(
Wait);
2251 bool Result = generateWaitcnt(
Wait, SuccessorIt,
Block, ScoreBrackets,
2254 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &
Block)) {
2262void SIInsertWaitcnts::updateEventWaitcntAfter(
MachineInstr &Inst,
2263 WaitcntBrackets *ScoreBrackets) {
2269 bool IsVMEMAccess =
false;
2270 bool IsSMEMAccess =
false;
2271 if (
TII->isDS(Inst) &&
TII->usesLGKM_CNT(Inst)) {
2273 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2274 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, GDS_ACCESS, Inst);
2275 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, GDS_GPR_LOCK, Inst);
2276 ScoreBrackets->setPendingGDS();
2278 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, LDS_ACCESS, Inst);
2280 }
else if (
TII->isFLAT(Inst)) {
2282 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, getVmemWaitEventType(Inst),
2289 int FlatASCount = 0;
2291 if (mayAccessVMEMThroughFlat(Inst)) {
2293 IsVMEMAccess =
true;
2294 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, getVmemWaitEventType(Inst),
2298 if (mayAccessLDSThroughFlat(Inst)) {
2300 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, LDS_ACCESS, Inst);
2306 if (FlatASCount > 1)
2307 ScoreBrackets->setPendingFlat();
2310 IsVMEMAccess =
true;
2311 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, getVmemWaitEventType(Inst),
2314 if (
ST->vmemWriteNeedsExpWaitcnt() &&
2316 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, VMW_GPR_LOCK, Inst);
2318 }
else if (
TII->isSMRD(Inst)) {
2319 IsSMEMAccess =
true;
2320 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SMEM_ACCESS, Inst);
2321 }
else if (Inst.
isCall()) {
2324 ScoreBrackets->applyWaitcnt(
2325 WCG->getAllZeroWaitcnt(
false));
2326 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2332 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_LDS_ACCESS, Inst);
2333 }
else if (
TII->isVINTERP(Inst)) {
2334 int64_t
Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2335 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2337 unsigned Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2339 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_PARAM_ACCESS, Inst);
2341 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_POS_ACCESS, Inst);
2343 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_GPR_LOCK, Inst);
2346 case AMDGPU::S_SENDMSG:
2347 case AMDGPU::S_SENDMSG_RTN_B32:
2348 case AMDGPU::S_SENDMSG_RTN_B64:
2349 case AMDGPU::S_SENDMSGHALT:
2350 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SQ_MESSAGE, Inst);
2352 case AMDGPU::S_MEMTIME:
2353 case AMDGPU::S_MEMREALTIME:
2354 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2355 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2356 case AMDGPU::S_BARRIER_LEAVE:
2357 case AMDGPU::S_GET_BARRIER_STATE_M0:
2358 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2359 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SMEM_ACCESS, Inst);
2368 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, VMEM_GROUP, Inst);
2371 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SMEM_GROUP, Inst);
2374bool WaitcntBrackets::mergeScore(
const MergeInfo &M,
unsigned &Score,
2375 unsigned OtherScore) {
2376 unsigned MyShifted = Score <=
M.OldLB ? 0 : Score +
M.MyShift;
2377 unsigned OtherShifted =
2378 OtherScore <=
M.OtherLB ? 0 : OtherScore +
M.OtherShift;
2379 Score = std::max(MyShifted, OtherShifted);
2380 return OtherShifted > MyShifted;
2388bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
2389 bool StrictDom =
false;
2391 VgprUB = std::max(VgprUB,
Other.VgprUB);
2392 SgprUB = std::max(SgprUB,
Other.SgprUB);
2394 for (
auto T : inst_counter_types(
Context->MaxCounter)) {
2396 const unsigned *WaitEventMaskForInst =
Context->WaitEventMaskForInst;
2397 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[
T];
2398 const unsigned OtherEvents =
Other.PendingEvents & WaitEventMaskForInst[
T];
2399 if (OtherEvents & ~OldEvents)
2401 PendingEvents |= OtherEvents;
2404 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
2405 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
2406 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
2407 if (NewUB < ScoreLBs[
T])
2411 M.OldLB = ScoreLBs[
T];
2412 M.OtherLB =
Other.ScoreLBs[
T];
2413 M.MyShift = NewUB - ScoreUBs[
T];
2414 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
2416 ScoreUBs[
T] = NewUB;
2418 StrictDom |= mergeScore(M, LastFlat[
T],
Other.LastFlat[
T]);
2421 StrictDom |= mergeScore(M, LastGDS,
Other.LastGDS);
2423 for (
int J = 0; J <= VgprUB; J++)
2424 StrictDom |= mergeScore(M, VgprScores[
T][J],
Other.VgprScores[
T][J]);
2426 if (isSmemCounter(
T)) {
2427 unsigned Idx = getSgprScoresIdx(
T);
2428 for (
int J = 0; J <= SgprUB; J++)
2430 mergeScore(M, SgprScores[
Idx][J],
Other.SgprScores[
Idx][J]);
2434 for (
int J = 0; J <= VgprUB; J++) {
2435 unsigned char NewVmemTypes = VgprVmemTypes[J] |
Other.VgprVmemTypes[J];
2436 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2437 VgprVmemTypes[J] = NewVmemTypes;
2445 return Opcode == AMDGPU::S_WAITCNT ||
2448 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2449 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2450 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2457 WaitcntBrackets &ScoreBrackets) {
2461 dbgs() <<
"*** Begin Block: ";
2463 ScoreBrackets.dump();
2469 bool VCCZCorrect =
true;
2470 if (
ST->hasReadVCCZBug()) {
2473 VCCZCorrect =
false;
2474 }
else if (!
ST->partialVCCWritesUpdateVCCZ()) {
2477 VCCZCorrect =
false;
2484 E =
Block.instr_end();
2495 if (!OldWaitcntInstr)
2496 OldWaitcntInstr = &Inst;
2501 bool FlushVmCnt =
Block.getFirstTerminator() == Inst &&
2502 isPreheaderToFlush(
Block, ScoreBrackets);
2505 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2507 OldWaitcntInstr =
nullptr;
2510 bool RestoreVCCZ = !VCCZCorrect &&
readsVCCZ(Inst);
2513 if (
ST->hasReadVCCZBug() || !
ST->partialVCCWritesUpdateVCCZ()) {
2517 if (!
ST->partialVCCWritesUpdateVCCZ())
2518 VCCZCorrect =
false;
2527 if (
ST->hasReadVCCZBug() &&
2528 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2531 VCCZCorrect =
false;
2539 if (
TII->isSMRD(Inst)) {
2543 if (!Memop->isInvariant()) {
2544 const Value *
Ptr = Memop->getValue();
2548 if (
ST->hasReadVCCZBug()) {
2550 VCCZCorrect =
false;
2554 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2556 Modified |= insertForcedWaitAfter(Inst,
Block, ScoreBrackets);
2560 ScoreBrackets.dump();
2570 TII->get(
ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2583 if (
Block.getFirstTerminator() ==
Block.end() &&
2584 isPreheaderToFlush(
Block, ScoreBrackets)) {
2585 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2587 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2589 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2598 dbgs() <<
"*** End Block: ";
2600 ScoreBrackets.dump();
2608bool SIInsertWaitcnts::isPreheaderToFlush(
2610 auto [Iterator, IsInserted] = PreheadersToFlush.
try_emplace(&
MBB,
false);
2612 return Iterator->second;
2623 shouldFlushVmCnt(
Loop, ScoreBrackets)) {
2624 Iterator->second =
true;
2631bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
2633 return mayAccessVMEMThroughFlat(
MI);
2646 const WaitcntBrackets &Brackets) {
2647 bool HasVMemLoad =
false;
2648 bool HasVMemStore =
false;
2649 bool UsesVgprLoadedOutside =
false;
2655 if (isVMEMOrFlatVMEM(
MI)) {
2659 HasVMemStore =
true;
2662 if (!
TRI->isVectorRegister(*
MRI,
Op.getReg()))
2674 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2675 Brackets.getScoreLB(LOAD_CNT) ||
2676 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2677 Brackets.getScoreLB(SAMPLE_CNT) ||
2678 Brackets.getRegScore(RegNo, BVH_CNT) >
2679 Brackets.getScoreLB(BVH_CNT)) {
2680 UsesVgprLoadedOutside =
true;
2687 if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad()) {
2701 if (!
ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2703 return HasVMemLoad && UsesVgprLoadedOutside &&
ST->hasVmemWriteVgprInOrder();
2706bool SIInsertWaitcntsLegacy::runOnMachineFunction(
MachineFunction &MF) {
2707 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2709 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2711 if (
auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2712 AA = &AAR->getAAResults();
2714 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2726 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2731 .preserve<AAManager>();
2736 TII = ST->getInstrInfo();
2737 TRI = &
TII->getRegisterInfo();
2743 if (ST->hasExtendedWaitCounts()) {
2744 MaxCounter = NUM_EXTENDED_INST_CNTS;
2745 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2746 WCG = &WCGGFX12Plus;
2748 MaxCounter = NUM_NORMAL_INST_CNTS;
2749 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2753 for (
auto T : inst_counter_types())
2754 ForceEmitWaitcnt[
T] =
false;
2756 WaitEventMaskForInst = WCG->getWaitEventMask();
2758 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2760 if (
ST->hasExtendedWaitCounts()) {
2774 [[maybe_unused]]
unsigned NumVGPRsMax =
2776 [[maybe_unused]]
unsigned NumSGPRsMax =
ST->getAddressableNumSGPRs();
2777 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2778 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2794 I != E && (
I->isPHI() ||
I->isMetaInstruction()); ++
I)
2797 if (
ST->hasExtendedWaitCounts()) {
2800 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2801 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2804 if (!
ST->hasImageInsts() &&
2805 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2809 TII->get(instrsForExtendedCounterTypes[CT]))
2816 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
this);
2817 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2818 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2828 std::unique_ptr<WaitcntBrackets> Brackets;
2833 for (
auto BII = BlockInfos.
begin(), BIE = BlockInfos.
end(); BII != BIE;
2836 BlockInfo &BI = BII->second;
2842 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2844 *Brackets = *BI.Incoming;
2847 Brackets = std::make_unique<WaitcntBrackets>(
this);
2852 Brackets->~WaitcntBrackets();
2853 new (Brackets.get()) WaitcntBrackets(
this);
2857 Modified |= insertWaitcntInBlock(MF, *
MBB, *Brackets);
2860 if (Brackets->hasPendingEvent()) {
2861 BlockInfo *MoveBracketsToSucc =
nullptr;
2863 auto *SuccBII = BlockInfos.
find(Succ);
2864 BlockInfo &SuccBI = SuccBII->second;
2865 if (!SuccBI.Incoming) {
2866 SuccBI.Dirty =
true;
2867 if (SuccBII <= BII) {
2871 if (!MoveBracketsToSucc) {
2872 MoveBracketsToSucc = &SuccBI;
2874 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2876 }
else if (SuccBI.Incoming->merge(*Brackets)) {
2877 SuccBI.Dirty =
true;
2878 if (SuccBII <= BII) {
2884 if (MoveBracketsToSucc)
2885 MoveBracketsToSucc->Incoming = std::move(Brackets);
2890 if (
ST->hasScalarStores()) {
2892 bool HaveScalarStores =
false;
2896 if (!HaveScalarStores &&
TII->isScalarStore(
MI))
2897 HaveScalarStores =
true;
2899 if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
2900 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2905 if (HaveScalarStores) {
2915 bool SeenDCacheWB =
false;
2919 if (
I->getOpcode() == AMDGPU::S_DCACHE_WB)
2920 SeenDCacheWB =
true;
2921 else if (
TII->isScalarStore(*
I))
2922 SeenDCacheWB =
false;
2925 if ((
I->getOpcode() == AMDGPU::S_ENDPGM ||
2926 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2944 TII->get(AMDGPU::S_ALLOC_VGPR))
2949 if (!ReleaseVGPRInsts.empty() &&
2950 (MF.getFrameInfo().hasCalls() ||
2951 ST->getOccupancyWithNumVGPRs(
2952 TRI->getNumUsedPhysRegs(*
MRI, AMDGPU::VGPR_32RegClass),
2956 if (
ST->requiresNopBeforeDeallocVGPRs()) {
2958 TII->get(AMDGPU::S_NOP))
2962 TII->get(AMDGPU::S_SENDMSG))
2968 ReleaseVGPRInsts.clear();
2969 PreheadersToFlush.
clear();
2970 SLoadAddresses.
clear();
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Analysis containing CSE Info
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
std::optional< std::vector< StOtherPiece > > Other
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
#define AMDGPU_EVENT_NAME(Name)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static bool readsVCCZ(const MachineInstr &MI)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
Provides some synthesis utilities to produce sequences of values.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
A private abstract base class describing the concept of an individual alias analysis implementation.
bool isEntryFunction() const
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Represents analyses that only rely on functions' control flow.
This class represents an Operation in the Expression.
static bool isCounterSet(unsigned ID)
static bool shouldExecute(unsigned CounterName)
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Instructions::iterator instr_iterator
instr_iterator instr_end()
iterator_range< succ_iterator > successors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
Analysis pass that exposes the MachineLoopInfo for a machine function.
A description of a memory reference used in the backend.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
unsigned getDynamicVGPRBlockSize() const
bool isDynamicVGPREnabled() const
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
LLVM Value Representation.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
self_iterator getIterator()
This class implements an extremely fast bulk output stream that can only output to a stream.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
@ ID_DEALLOC_VGPRS_GFX11Plus
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI)
If Reg is a pseudo reg, return the correct hardware register given STI otherwise return Reg.
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
unsigned getStorecntBitMask(const IsaVersion &Version)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getXcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
CodeGenOptLevel
Code generation optimization level.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
FunctionPass * createSIInsertWaitcntsPass()
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
static constexpr bool is_iterable