LLVM 21.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39using namespace llvm;
40
41#define DEBUG_TYPE "si-insert-waitcnts"
42
// Debug counters: let developers force a full wait on individual counter
// kinds from the command line (-debug-counter=...). They are referenced in
// the SIInsertWaitcnts constructor via (void) casts so release builds do not
// warn about them being unused.
43DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
44 "Force emit s_waitcnt expcnt(0) instrs");
45DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
46 "Force emit s_waitcnt lgkmcnt(0) instrs");
47DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
48 "Force emit s_waitcnt vmcnt(0) instrs");
49
// Command-line escape hatch: emit fully-zero waitcnts everywhere (useful for
// triaging suspected missing-wait bugs).
50static cl::opt<bool>
51 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
52 cl::desc("Force all waitcnt instrs to be emitted as "
53 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
54 cl::init(false), cl::Hidden);
55
57 "amdgpu-waitcnt-load-forcezero",
58 cl::desc("Force all waitcnt load counters to wait until 0"),
59 cl::init(false), cl::Hidden);
60
61namespace {
62// Class of object that encapsulates latest instruction counter score
63// associated with the operand. Used for determining whether
64// s_waitcnt instruction needs to be emitted.
65
// Hardware wait counters tracked by this pass. The first four exist on all
// targets ("normal" mode). The gfx12+-only counters continue the numbering
// from NUM_NORMAL_INST_CNTS ("extended" mode); code iterates counters with
// inst_counter_types(MaxCounter) so pre-gfx12 targets simply stop early.
66enum InstCounterType {
67 LOAD_CNT = 0, // VMcnt prior to gfx12.
68 DS_CNT, // LKGMcnt prior to gfx12.
69 EXP_CNT, // Export counter (see the EXP_* wait events below).
70 STORE_CNT, // VScnt in gfx10/gfx11.
71 NUM_NORMAL_INST_CNTS,
72 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
73 BVH_CNT, // gfx12+ only.
74 KM_CNT, // gfx12+ only.
75 NUM_EXTENDED_INST_CNTS,
76 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
77};
78} // namespace
79
80namespace llvm {
// Opt InstCounterType into llvm::enum_seq so the counter kinds can be
// iterated directly (see inst_counter_types below).
81template <> struct enum_iteration_traits<InstCounterType> {
82 static constexpr bool is_iterable = true;
83};
84} // namespace llvm
85
86namespace {
87// Return an iterator over all counters between LOAD_CNT (the first counter)
88// and \c MaxCounter (exclusive, default value yields an enumeration over
89// all counters).
// Relies on the enum_iteration_traits<InstCounterType> specialization above.
90auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
91 return enum_seq(LOAD_CNT, MaxCounter);
92}
93
// Half-open [first, second) range of scoreboard slots (see RegisterMapping):
// loops iterate with RegNo < Interval.second. {-1, -1} means "not tracked".
94using RegInterval = std::pair<int, int>;
95
// Per-target maximum encodable value for each wait counter; queried through
// WaitcntBrackets::getWaitCountMax.
96struct HardwareLimits {
97 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
98 unsigned ExpcntMax;
99 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
100 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
101 unsigned SamplecntMax; // gfx12+ only.
102 unsigned BvhcntMax; // gfx12+ only.
103 unsigned KmcntMax; // gfx12+ only.
104};
105
// First/last hardware encoding values of VGPRs and SGPRs on the current
// subtarget; getRegInterval subtracts VGPR0/SGPR0 from an operand's encoding
// to obtain its scoreboard slot.
106struct RegisterEncoding {
107 unsigned VGPR0;
108 unsigned VGPRL;
109 unsigned SGPR0;
110 unsigned SGPRL;
111};
112
// Events that increment some hardware counter. Which counter an event maps
// to is target-dependent and encoded in the per-generator eventMask tables
// (see getWaitEventMask); eventCounter() performs the reverse lookup.
113enum WaitEventType {
114 VMEM_ACCESS, // vector-memory read & write
115 VMEM_READ_ACCESS, // vector-memory read
116 VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
117 VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
118 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
119 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
120 LDS_ACCESS, // lds read & write
121 GDS_ACCESS, // gds read & write
122 SQ_MESSAGE, // send message
123 SMEM_ACCESS, // scalar-memory read & write
124 EXP_GPR_LOCK, // export holding on its data src
125 GDS_GPR_LOCK, // GDS holding on its data and addr src
126 EXP_POS_ACCESS, // write to export position
127 EXP_PARAM_ACCESS, // write to export parameter
128 VMW_GPR_LOCK, // vector-memory write holding on its data src
129 EXP_LDS_ACCESS, // read by ldsdir counting as export
130 NUM_WAIT_EVENTS,
131};
132
133// The mapping is:
134// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
135// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
136// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
137// We reserve a fixed number of VGPR slots in the scoring tables for
138// special tokens like SCMEM_LDS (needed for buffer load to LDS).
139enum RegisterMapping {
140 SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
// AGPRs share the VGPR slot range: getRegInterval offsets them by
// AGPR_OFFSET so ArchVGPRs and AGPRs do not collide.
141 AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
142 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
143 NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
144 // Artificial register slots to track LDS writes into specific LDS locations
145 // if a location is known. When slots are exhausted or location is
146 // unknown use the first slot. The first slot is also always updated in
147 // addition to known location's slot to properly generate waits if dependent
148 // instruction's location is unknown.
149 EXTRA_VGPR_LDS = 0,
150 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
151};
152
153// Enumerate different types of result-returning VMEM operations. Although
154// s_waitcnt orders them all with a single vmcnt counter, in the absence of
155// s_waitcnt only instructions of the same VmemType are guaranteed to write
156// their results in order -- so there is no need to insert an s_waitcnt between
157// two instructions of the same type that write the same vgpr.
158enum VmemType {
159 // BUF instructions and MIMG instructions without a sampler.
160 VMEM_NOSAMPLER,
161 // MIMG instructions with a sampler.
162 VMEM_SAMPLER,
163 // BVH instructions
164 VMEM_BVH,
// Count of types; also sizes VmemReadMapping in getVmemWaitEventType.
165 NUM_VMEM_TYPES
166};
167
168// Maps values of InstCounterType to the instruction that waits on that
169// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
170// returns true.
// NOTE: element order must match the declaration order of InstCounterType
// (LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT, SAMPLE_CNT, BVH_CNT, KM_CNT).
171static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
172 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
173 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
174 AMDGPU::S_WAIT_KMCNT};
175
176static bool updateVMCntOnly(const MachineInstr &Inst) {
177 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
179}
180
181#ifndef NDEBUG
// Debug-only helper for assertions: true when only the pre-gfx12 ("normal")
// counters are in use, i.e. the extended gfx12+ counters are absent.
182static bool isNormalMode(InstCounterType MaxCounter) {
183 return MaxCounter == NUM_NORMAL_INST_CNTS;
184}
185#endif // NDEBUG
186
187VmemType getVmemType(const MachineInstr &Inst) {
188 assert(updateVMCntOnly(Inst));
189 if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
191 return VMEM_NOSAMPLER;
193 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
195 // We have to make an additional check for isVSAMPLE here since some
196 // instructions don't have a sampler, but are still classified as sampler
197 // instructions for the purposes of e.g. waitcnt.
198 return BaseInfo->BVH ? VMEM_BVH
199 : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
200 : VMEM_NOSAMPLER;
201}
202
203unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
204 switch (T) {
205 case LOAD_CNT:
206 return Wait.LoadCnt;
207 case EXP_CNT:
208 return Wait.ExpCnt;
209 case DS_CNT:
210 return Wait.DsCnt;
211 case STORE_CNT:
212 return Wait.StoreCnt;
213 case SAMPLE_CNT:
214 return Wait.SampleCnt;
215 case BVH_CNT:
216 return Wait.BvhCnt;
217 case KM_CNT:
218 return Wait.KmCnt;
219 default:
220 llvm_unreachable("bad InstCounterType");
221 }
222}
223
224void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
225 unsigned &WC = getCounterRef(Wait, T);
226 WC = std::min(WC, Count);
227}
228
229void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
230 getCounterRef(Wait, T) = ~0u;
231}
232
233unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
234 return getCounterRef(Wait, T);
235}
236
237// Mapping from event to counter according to the table masks.
238InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
239 for (auto T : inst_counter_types()) {
240 if (masks[T] & (1 << E))
241 return T;
242 }
243 llvm_unreachable("event type has no associated counter");
244}
245
246// This objects maintains the current score brackets of each wait counter, and
247// a per-register scoreboard for each wait counter.
248//
249// We also maintain the latest score for every event type that can change the
250// waitcnt in order to know if there are multiple types of events within
251// the brackets. When multiple types of event happen in the bracket,
252// wait count may get decreased out of order, therefore we need to put in
253// "s_waitcnt 0" before use.
254class WaitcntBrackets {
255public:
256 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
257 HardwareLimits Limits, RegisterEncoding Encoding,
258 const unsigned *WaitEventMaskForInst,
259 InstCounterType SmemAccessCounter)
260 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
261 Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
262 SmemAccessCounter(SmemAccessCounter) {}
263
 // Largest value the hardware can encode for counter T; 0 for any counter
 // this switch does not know about.
264 unsigned getWaitCountMax(InstCounterType T) const {
265 switch (T) {
266 case LOAD_CNT:
267 return Limits.LoadcntMax;
268 case DS_CNT:
269 return Limits.DscntMax;
270 case EXP_CNT:
271 return Limits.ExpcntMax;
272 case STORE_CNT:
273 return Limits.StorecntMax;
274 case SAMPLE_CNT:
275 return Limits.SamplecntMax;
276 case BVH_CNT:
277 return Limits.BvhcntMax;
278 case KM_CNT:
279 return Limits.KmcntMax;
280 default:
281 break;
282 }
283 return 0;
284 }
285
 // [ScoreLB, ScoreUB] brackets the scores of operations that may still be
 // outstanding on counter T; a register score in (LB, UB] indicates a wait
 // may be needed before the register can be accessed (see hasPendingFlat
 // for the same comparison pattern).
286 unsigned getScoreLB(InstCounterType T) const {
287 assert(T < NUM_INST_CNTS);
288 return ScoreLBs[T];
289 }
290
291 unsigned getScoreUB(InstCounterType T) const {
292 assert(T < NUM_INST_CNTS);
293 return ScoreUBs[T];
294 }
295
296 unsigned getScoreRange(InstCounterType T) const {
297 return getScoreUB(T) - getScoreLB(T);
298 }
299
 // Slots below NUM_ALL_VGPRS are VGPR-like; the remainder index SgprScores,
 // which is only meaningful for the SMEM-access counter.
300 unsigned getRegScore(int GprNo, InstCounterType T) const {
301 if (GprNo < NUM_ALL_VGPRS) {
302 return VgprScores[T][GprNo];
303 }
304 assert(T == SmemAccessCounter);
305 return SgprScores[GprNo - NUM_ALL_VGPRS];
306 }
307
 // Merge the scoreboard state of another bracket (e.g. from another
 // predecessor of a join block) into this one; defined out of line.
308 bool merge(const WaitcntBrackets &Other);
309
310 RegInterval getRegInterval(const MachineInstr *MI,
312 const SIRegisterInfo *TRI,
313 const MachineOperand &Op) const;
314
315 bool counterOutOfOrder(InstCounterType T) const;
316 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
317 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
318
319 void determineWait(InstCounterType T, RegInterval Interval,
320 AMDGPU::Waitcnt &Wait) const;
 // Single-register convenience overload: forwards a one-slot interval.
321 void determineWait(InstCounterType T, int RegNo,
322 AMDGPU::Waitcnt &Wait) const {
323 determineWait(T, {RegNo, RegNo + 1}, Wait);
324 }
325
326 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
327 void applyWaitcnt(InstCounterType T, unsigned Count);
328 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
329 const MachineRegisterInfo *MRI, WaitEventType E,
331
 // Queries over the per-event pending bits (one bit per WaitEventType).
332 unsigned hasPendingEvent() const { return PendingEvents; }
333 unsigned hasPendingEvent(WaitEventType E) const {
334 return PendingEvents & (1 << E);
335 }
336 unsigned hasPendingEvent(InstCounterType T) const {
337 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
338 assert((HasPending != 0) == (getScoreRange(T) != 0));
339 return HasPending;
340 }
341
342 bool hasMixedPendingEvents(InstCounterType T) const {
343 unsigned Events = hasPendingEvent(T);
344 // Return true if more than one bit is set in Events.
345 return Events & (Events - 1);
346 }
347
348 bool hasPendingFlat() const {
349 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
350 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
351 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
352 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
353 }
354
 // Record the current upper bounds for both counters a FLAT access can
 // affect, so hasPendingFlat can later tell whether it is still in flight.
355 void setPendingFlat() {
356 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
357 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
358 }
359
360 // Return true if there might be pending writes to the vgpr-interval by VMEM
361 // instructions with types different from V.
362 bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
363 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
364 assert(RegNo < NUM_ALL_VGPRS);
365 if (VgprVmemTypes[RegNo] & ~(1 << V))
366 return true;
367 }
368 return false;
369 }
370
371 void clearVgprVmemTypes(RegInterval Interval) {
372 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
373 assert(RegNo < NUM_ALL_VGPRS);
374 VgprVmemTypes[RegNo] = 0;
375 }
376 }
377
 // Conservatively assume a full STORE_CNT's worth of stores may be
 // outstanding at function entry / across returns.
378 void setStateOnFunctionEntryOrReturn() {
379 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
380 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
381 }
382
383 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
384 return LDSDMAStores;
385 }
386
387 void print(raw_ostream &);
388 void dump() { print(dbgs()); }
389
390private:
391 struct MergeInfo {
392 unsigned OldLB;
393 unsigned OtherLB;
394 unsigned MyShift;
395 unsigned OtherShift;
396 };
397 static bool mergeScore(const MergeInfo &M, unsigned &Score,
398 unsigned OtherScore);
399
400 void setScoreLB(InstCounterType T, unsigned Val) {
401 assert(T < NUM_INST_CNTS);
402 ScoreLBs[T] = Val;
403 }
404
405 void setScoreUB(InstCounterType T, unsigned Val) {
406 assert(T < NUM_INST_CNTS);
407 ScoreUBs[T] = Val;
408
 // EXP_CNT is special: keep its score range within the hardware maximum by
 // advancing the lower bound when the upper bound moves too far ahead.
409 if (T != EXP_CNT)
410 return;
411
412 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
413 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT)
414 }
415
416 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
417 setScoreByInterval({GprNo, GprNo + 1}, T, Val);
418 }
419
420 void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
421 unsigned Score);
422
423 void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
425 const MachineOperand &Op, InstCounterType CntTy,
426 unsigned Val);
427
 // --- Immutable configuration, set by the constructor ---
428 const GCNSubtarget *ST = nullptr;
429 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
430 HardwareLimits Limits = {};
431 RegisterEncoding Encoding = {};
432 const unsigned *WaitEventMaskForInst;
433 InstCounterType SmemAccessCounter;
 // --- Mutable scoreboard state ---
434 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
435 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
436 unsigned PendingEvents = 0;
437 // Remember the last flat memory operation.
438 unsigned LastFlat[NUM_INST_CNTS] = {0};
439 // wait_cnt scores for every vgpr.
440 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
441 int VgprUB = -1;
442 int SgprUB = -1;
443 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
444 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
445 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
446 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
447 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
448 // write to each vgpr.
449 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
450 // Store representative LDS DMA operations. The only useful info here is
451 // alias info. One store is kept per unique AAInfo.
452 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
453};
454
455// This abstracts the logic for generating and updating S_WAIT* instructions
456// away from the analysis that determines where they are needed. This was
457// done because the set of counters and instructions for waiting on them
458// underwent a major shift with gfx12, sufficiently so that having this
459// abstraction allows the main analysis logic to be simpler than it would
460// otherwise have had to become.
461class WaitcntGenerator {
462protected:
463 const GCNSubtarget *ST = nullptr;
464 const SIInstrInfo *TII = nullptr;
466 InstCounterType MaxCounter;
 // Set from Function::hasOptNone() / CodeGenOptLevel::None in the ctor.
467 bool OptNone;
468
469public:
470 WaitcntGenerator() = default;
471 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
472 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
473 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
474 OptNone(MF.getFunction().hasOptNone() ||
475 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
476
477 // Return true if the current function should be compiled with no
478 // optimization.
479 bool isOptNone() const { return OptNone; }
480
481 // Edits an existing sequence of wait count instructions according
482 // to an incoming Waitcnt value, which is itself updated to reflect
483 // any new wait count instructions which may need to be generated by
484 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
485 // were made.
486 //
487 // This editing will usually be merely updated operands, but it may also
488 // delete instructions if the incoming Wait value indicates they are not
489 // needed. It may also remove existing instructions for which a wait
490 // is needed if it can be determined that it is better to generate new
491 // instructions later, as can happen on gfx12.
492 virtual bool
493 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
494 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
496
497 // Transform a soft waitcnt into a normal one.
498 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
499
500 // Generates new wait count instructions according to the value of
501 // Wait, returning true if any new instructions were created.
502 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
505
506 // Returns an array of bit masks which can be used to map values in
507 // WaitEventType to corresponding counter values in InstCounterType.
508 virtual const unsigned *getWaitEventMask() const = 0;
509
510 // Returns a new waitcnt with all counters except VScnt set to 0. If
511 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
512 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
513
514 virtual ~WaitcntGenerator() = default;
515
516 // Create a mask value from the initializer list of wait event types.
 // constexpr so the per-subclass mask tables are built at compile time.
517 static constexpr unsigned
518 eventMask(std::initializer_list<WaitEventType> Events) {
519 unsigned Mask = 0;
520 for (auto &E : Events)
521 Mask |= 1 << E;
522
523 return Mask;
524 }
525};
526
// Generator flavour for targets before gfx12: only the four legacy counters
// exist, so the table entries for the gfx12-only counters are left zero.
527class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
528public:
529 WaitcntGeneratorPreGFX12() = default;
530 WaitcntGeneratorPreGFX12(const MachineFunction &MF)
531 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
532
533 bool
534 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
535 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
536 MachineBasicBlock::instr_iterator It) const override;
537
538 bool createNewWaitcnt(MachineBasicBlock &Block,
540 AMDGPU::Waitcnt Wait) override;
541
 // Rows are indexed by InstCounterType: LOAD, DS, EXP, STORE, then zeros
 // for the counters that do not exist pre-gfx12.
542 const unsigned *getWaitEventMask() const override {
543 assert(ST);
544
545 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
546 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
547 VMEM_BVH_READ_ACCESS}),
548 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
549 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
550 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
551 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
552 0,
553 0,
554 0};
555
556 return WaitEventMaskForInstPreGFX12;
557 }
558
559 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
560};
561
// Generator flavour for gfx12+: sampler/BVH reads and SMEM get their own
// counters (SAMPLE_CNT, BVH_CNT, KM_CNT) instead of sharing VMcnt/LGKMcnt.
562class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
563public:
564 WaitcntGeneratorGFX12Plus() = default;
565 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
566 InstCounterType MaxCounter)
567 : WaitcntGenerator(MF, MaxCounter) {}
568
569 bool
570 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
571 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
572 MachineBasicBlock::instr_iterator It) const override;
573
574 bool createNewWaitcnt(MachineBasicBlock &Block,
576 AMDGPU::Waitcnt Wait) override;
577
 // Rows are indexed by InstCounterType; note SMEM_ACCESS/SQ_MESSAGE map to
 // KM_CNT here, unlike the pre-gfx12 table where they map to the DS row.
578 const unsigned *getWaitEventMask() const override {
579 assert(ST);
580
581 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
582 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
583 eventMask({LDS_ACCESS, GDS_ACCESS}),
584 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
585 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
586 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
587 eventMask({VMEM_SAMPLER_READ_ACCESS}),
588 eventMask({VMEM_BVH_READ_ACCESS}),
589 eventMask({SMEM_ACCESS, SQ_MESSAGE})};
590
591 return WaitEventMaskForInstGFX12Plus;
592 }
593
594 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
595};
596
597class SIInsertWaitcnts : public MachineFunctionPass {
598private:
599 const GCNSubtarget *ST = nullptr;
600 const SIInstrInfo *TII = nullptr;
601 const SIRegisterInfo *TRI = nullptr;
602 const MachineRegisterInfo *MRI = nullptr;
603
605 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
606 MachineLoopInfo *MLI;
608 AliasAnalysis *AA = nullptr;
609
 // Per-basic-block state: the bracket state at block entry, and a flag
 // (defaulting to true) marking blocks that still need processing.
610 struct BlockInfo {
611 std::unique_ptr<WaitcntBrackets> Incoming;
612 bool Dirty = true;
613 };
614
615 InstCounterType SmemAccessCounter;
616
618
 // Per-counter flags driven by the DEBUG_COUNTERs above (debug builds only;
 // see setForceEmitWaitcnt).
619 bool ForceEmitWaitcnt[NUM_INST_CNTS];
620
621 // In any given run of this pass, WCG will point to one of these two
622 // generator objects, which must have been re-initialised before use
623 // from a value made using a subtarget constructor.
624 WaitcntGeneratorPreGFX12 WCGPreGFX12;
625 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
626
627 WaitcntGenerator *WCG = nullptr;
628
629 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
630 // message.
631 DenseSet<MachineInstr *> ReleaseVGPRInsts;
632
633 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
634
635public:
636 static char ID;
637
 // The (void) casts keep the DEBUG_COUNTER globals referenced in release
 // builds, avoiding unused-variable warnings.
638 SIInsertWaitcnts() : MachineFunctionPass(ID) {
639 (void)ForceExpCounter;
640 (void)ForceLgkmCounter;
641 (void)ForceVMCounter;
642 }
643
644 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
645 bool isPreheaderToFlush(MachineBasicBlock &MBB,
646 WaitcntBrackets &ScoreBrackets);
647 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
648 bool runOnMachineFunction(MachineFunction &MF) override;
649
650 StringRef getPassName() const override {
651 return "SI insert wait instructions";
652 }
653
654 void getAnalysisUsage(AnalysisUsage &AU) const override {
655 AU.setPreservesCFG();
661 }
662
 // True if any counter is currently being forced by a debug counter.
663 bool isForceEmitWaitcnt() const {
664 for (auto T : inst_counter_types())
665 if (ForceEmitWaitcnt[T])
666 return true;
667 return false;
668 }
669
670 void setForceEmitWaitcnt() {
671// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
672// For debug builds, get the debug counter info and adjust if need be
673#ifndef NDEBUG
674 if (DebugCounter::isCounterSet(ForceExpCounter) &&
675 DebugCounter::shouldExecute(ForceExpCounter)) {
676 ForceEmitWaitcnt[EXP_CNT] = true;
677 } else {
678 ForceEmitWaitcnt[EXP_CNT] = false;
679 }
680
 // "lgkm" forcing covers both of its gfx12 successors (DS and KM).
681 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
682 DebugCounter::shouldExecute(ForceLgkmCounter)) {
683 ForceEmitWaitcnt[DS_CNT] = true;
684 ForceEmitWaitcnt[KM_CNT] = true;
685 } else {
686 ForceEmitWaitcnt[DS_CNT] = false;
687 ForceEmitWaitcnt[KM_CNT] = false;
688 }
689
 // "vm" forcing covers LOAD plus the gfx12 sampler/BVH split-offs.
690 if (DebugCounter::isCounterSet(ForceVMCounter) &&
691 DebugCounter::shouldExecute(ForceVMCounter)) {
692 ForceEmitWaitcnt[LOAD_CNT] = true;
693 ForceEmitWaitcnt[SAMPLE_CNT] = true;
694 ForceEmitWaitcnt[BVH_CNT] = true;
695 } else {
696 ForceEmitWaitcnt[LOAD_CNT] = false;
697 ForceEmitWaitcnt[SAMPLE_CNT] = false;
698 ForceEmitWaitcnt[BVH_CNT] = false;
699 }
700#endif // NDEBUG
701 }
702
703 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
704 // FLAT instruction.
705 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
706 // Maps VMEM access types to their corresponding WaitEventType.
707 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
708 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
709
711 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
712 // these should use VM_CNT.
713 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
714 return VMEM_ACCESS;
715 if (Inst.mayStore() &&
716 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
717 // FLAT and SCRATCH instructions may access scratch. Other VMEM
718 // instructions do not.
719 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
720 return SCRATCH_WRITE_ACCESS;
721 return VMEM_WRITE_ACCESS;
722 }
723 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
724 return VMEM_READ_ACCESS;
725 return VmemReadMapping[getVmemType(Inst)];
726 }
727
728 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
729 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
730 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
731 bool generateWaitcntInstBefore(MachineInstr &MI,
732 WaitcntBrackets &ScoreBrackets,
733 MachineInstr *OldWaitcntInstr,
734 bool FlushVmCnt);
735 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
737 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
738 MachineInstr *OldWaitcntInstr);
739 void updateEventWaitcntAfter(MachineInstr &Inst,
740 WaitcntBrackets *ScoreBrackets);
741 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
742 WaitcntBrackets &ScoreBrackets);
743};
744
745} // end anonymous namespace
746
// Map \p Op's physical register to a half-open [first, second) range of
// scoreboard slots (see RegisterMapping); {-1, -1} means the register is not
// tracked (unallocatable, or a class like TTMP that is not handled yet).
747RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
749 const SIRegisterInfo *TRI,
750 const MachineOperand &Op) const {
751 if (!TRI->isInAllocatableClass(Op.getReg()))
752 return {-1, -1};
753
754 // A use via a PW operand does not need a waitcnt.
755 // A partial write is not a WAW.
756 assert(!Op.getSubReg() || !Op.isUndef());
757
758 RegInterval Result;
759
760 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
762
763 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
764 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
765 Result.first = Reg - Encoding.VGPR0;
 // AGPRs live in the upper half of the VGPR slot range.
766 if (TRI->isAGPR(*MRI, Op.getReg()))
767 Result.first += AGPR_OFFSET;
768 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
769 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
770 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
771 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
772 assert(Result.first >= NUM_ALL_VGPRS &&
773 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
774 }
775 // TODO: Handle TTMP
776 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
777 else
778 return {-1, -1};
779
780 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
781 unsigned Size = TRI->getRegSizeInBits(*RC);
 // One slot per 32 bits; the +16 rounds a 16-bit (lo/hi half) register up
 // to a full slot.
782 Result.second = Result.first + ((Size + 16) / 32);
783
784 return Result;
785}
786
787void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
788 InstCounterType CntTy,
789 unsigned Score) {
790 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
791 if (RegNo < NUM_ALL_VGPRS) {
792 VgprUB = std::max(VgprUB, RegNo);
793 VgprScores[CntTy][RegNo] = Score;
794 } else {
795 assert(CntTy == SmemAccessCounter);
796 SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
797 SgprScores[RegNo - NUM_ALL_VGPRS] = Score;
798 }
799 }
800}
801
// Convenience wrapper: resolve \p Op to its scoreboard slot interval and
// record \p Score for counter \p CntTy on every slot in it.
802void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
803 const SIRegisterInfo *TRI,
805 const MachineOperand &Op,
806 InstCounterType CntTy, unsigned Score) {
807 RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
808 setScoreByInterval(Interval, CntTy, Score);
809}
810
811void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
812 const SIRegisterInfo *TRI,
814 WaitEventType E, MachineInstr &Inst) {
815 InstCounterType T = eventCounter(WaitEventMaskForInst, E);
816
817 unsigned UB = getScoreUB(T);
818 unsigned CurrScore = UB + 1;
819 if (CurrScore == 0)
820 report_fatal_error("InsertWaitcnt score wraparound");
821 // PendingEvents and ScoreUB need to be update regardless if this event
822 // changes the score of a register or not.
823 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
824 PendingEvents |= 1 << E;
825 setScoreUB(T, CurrScore);
826
827 if (T == EXP_CNT) {
828 // Put score on the source vgprs. If this is a store, just use those
829 // specific register(s).
830 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
831 // All GDS operations must protect their address register (same as
832 // export.)
833 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
834 setScoreByOperand(&Inst, TRI, MRI, *AddrOp, EXP_CNT, CurrScore);
835
836 if (Inst.mayStore()) {
837 if (const auto *Data0 =
838 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
839 setScoreByOperand(&Inst, TRI, MRI, *Data0, EXP_CNT, CurrScore);
840 if (const auto *Data1 =
841 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
842 setScoreByOperand(&Inst, TRI, MRI, *Data1, EXP_CNT, CurrScore);
843 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
844 Inst.getOpcode() != AMDGPU::DS_APPEND &&
845 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
846 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
847 for (const MachineOperand &Op : Inst.all_uses()) {
848 if (TRI->isVectorRegister(*MRI, Op.getReg()))
849 setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
850 }
851 }
852 } else if (TII->isFLAT(Inst)) {
853 if (Inst.mayStore()) {
854 setScoreByOperand(&Inst, TRI, MRI,
855 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
856 EXP_CNT, CurrScore);
857 } else if (SIInstrInfo::isAtomicRet(Inst)) {
858 setScoreByOperand(&Inst, TRI, MRI,
859 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
860 EXP_CNT, CurrScore);
861 }
862 } else if (TII->isMIMG(Inst)) {
863 if (Inst.mayStore()) {
864 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
865 CurrScore);
866 } else if (SIInstrInfo::isAtomicRet(Inst)) {
867 setScoreByOperand(&Inst, TRI, MRI,
868 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
869 EXP_CNT, CurrScore);
870 }
871 } else if (TII->isMTBUF(Inst)) {
872 if (Inst.mayStore())
873 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
874 CurrScore);
875 } else if (TII->isMUBUF(Inst)) {
876 if (Inst.mayStore()) {
877 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
878 CurrScore);
879 } else if (SIInstrInfo::isAtomicRet(Inst)) {
880 setScoreByOperand(&Inst, TRI, MRI,
881 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
882 EXP_CNT, CurrScore);
883 }
884 } else if (TII->isLDSDIR(Inst)) {
885 // LDSDIR instructions attach the score to the destination.
886 setScoreByOperand(&Inst, TRI, MRI,
887 *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
888 EXP_CNT, CurrScore);
889 } else {
890 if (TII->isEXP(Inst)) {
891 // For export the destination registers are really temps that
892 // can be used as the actual source after export patching, so
893 // we need to treat them like sources and set the EXP_CNT
894 // score.
895 for (MachineOperand &DefMO : Inst.all_defs()) {
896 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
897 setScoreByOperand(&Inst, TRI, MRI, DefMO, EXP_CNT, CurrScore);
898 }
899 }
900 }
901 for (const MachineOperand &Op : Inst.all_uses()) {
902 if (TRI->isVectorRegister(*MRI, Op.getReg()))
903 setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
904 }
905 }
906 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
907 // Match the score to the destination registers.
908 //
909 // Check only explicit operands. Stores, especially spill stores, include
910 // implicit uses and defs of their super registers which would create an
911 // artificial dependency, while these are there only for register liveness
912 // accounting purposes.
913 //
914 // Special cases where implicit register defs exists, such as M0 or VCC,
915 // but none with memory instructions.
916 for (const MachineOperand &Op : Inst.defs()) {
917 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
918 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
919 if (Interval.first >= NUM_ALL_VGPRS)
920 continue;
921 if (updateVMCntOnly(Inst)) {
922 // updateVMCntOnly should only leave us with VGPRs
923 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
924 // defs. That's required for a sane index into `VgprMemTypes` below
925 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
926 VmemType V = getVmemType(Inst);
927 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
928 VgprVmemTypes[RegNo] |= 1 << V;
929 }
930 }
931 setScoreByInterval(Interval, T, CurrScore);
932 }
933 if (Inst.mayStore() &&
934 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
935 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
936 // written can be accessed. A load from LDS to VMEM does not need a wait.
937 unsigned Slot = 0;
938 for (const auto *MemOp : Inst.memoperands()) {
939 if (!MemOp->isStore() ||
940 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
941 continue;
942 // Comparing just AA info does not guarantee memoperands are equal
943 // in general, but this is so for LDS DMA in practice.
944 auto AAI = MemOp->getAAInfo();
945 // Alias scope information gives a way to definitely identify an
946 // original memory object and practically produced in the module LDS
947 // lowering pass. If there is no scope available we will not be able
948 // to disambiguate LDS aliasing as after the module lowering all LDS
949 // is squashed into a single big object. Do not attempt to use one of
950 // the limited LDSDMAStores for something we will not be able to use
951 // anyway.
952 if (!AAI || !AAI.Scope)
953 break;
954 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
955 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
956 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
957 Slot = I + 1;
958 break;
959 }
960 }
961 }
962 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
963 break;
964 LDSDMAStores.push_back(&Inst);
965 Slot = LDSDMAStores.size();
966 break;
967 }
968 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
969 if (Slot)
970 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
971 }
972 }
973}
974
975void WaitcntBrackets::print(raw_ostream &OS) {
976 OS << '\n';
977 for (auto T : inst_counter_types(MaxCounter)) {
978 unsigned SR = getScoreRange(T);
979
980 switch (T) {
981 case LOAD_CNT:
982 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
983 << SR << "): ";
984 break;
985 case DS_CNT:
986 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
987 << SR << "): ";
988 break;
989 case EXP_CNT:
990 OS << " EXP_CNT(" << SR << "): ";
991 break;
992 case STORE_CNT:
993 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
994 << SR << "): ";
995 break;
996 case SAMPLE_CNT:
997 OS << " SAMPLE_CNT(" << SR << "): ";
998 break;
999 case BVH_CNT:
1000 OS << " BVH_CNT(" << SR << "): ";
1001 break;
1002 case KM_CNT:
1003 OS << " KM_CNT(" << SR << "): ";
1004 break;
1005 default:
1006 OS << " UNKNOWN(" << SR << "): ";
1007 break;
1008 }
1009
1010 if (SR != 0) {
1011 // Print vgpr scores.
1012 unsigned LB = getScoreLB(T);
1013
1014 for (int J = 0; J <= VgprUB; J++) {
1015 unsigned RegScore = getRegScore(J, T);
1016 if (RegScore <= LB)
1017 continue;
1018 unsigned RelScore = RegScore - LB - 1;
1019 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
1020 OS << RelScore << ":v" << J << " ";
1021 } else {
1022 OS << RelScore << ":ds ";
1023 }
1024 }
1025 // Also need to print sgpr scores for lgkm_cnt.
1026 if (T == SmemAccessCounter) {
1027 for (int J = 0; J <= SgprUB; J++) {
1028 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
1029 if (RegScore <= LB)
1030 continue;
1031 unsigned RelScore = RegScore - LB - 1;
1032 OS << RelScore << ":s" << J << " ";
1033 }
1034 }
1035 }
1036 OS << '\n';
1037 }
1038 OS << '\n';
1039}
1040
/// Simplify the waitcnt, in the sense of removing redundant counts. Each
/// count that the bracket state shows to be redundant is reset to "no wait"
/// (~0u) by the per-counter overload below.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  // Forward every tracked counter to the single-counter simplification.
  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
  simplifyWaitcnt(DS_CNT, Wait.DsCnt);
  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
  simplifyWaitcnt(KM_CNT, Wait.KmCnt);
}
1052
1053void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1054 unsigned &Count) const {
1055 // The number of outstanding events for this type, T, can be calculated
1056 // as (UB - LB). If the current Count is greater than or equal to the number
1057 // of outstanding events, then the wait for this counter is redundant.
1058 if (Count >= getScoreRange(T))
1059 Count = ~0u;
1060}
1061
1062void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
1063 AMDGPU::Waitcnt &Wait) const {
1064 const unsigned LB = getScoreLB(T);
1065 const unsigned UB = getScoreUB(T);
1066 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1067 unsigned ScoreToWait = getRegScore(RegNo, T);
1068
1069 // If the score of src_operand falls within the bracket, we need an
1070 // s_waitcnt instruction.
1071 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1072 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1073 !ST->hasFlatLgkmVMemCountInOrder()) {
1074 // If there is a pending FLAT operation, and this is a VMem or LGKM
1075 // waitcnt and the target can report early completion, then we need
1076 // to force a waitcnt 0.
1077 addWait(Wait, T, 0);
1078 } else if (counterOutOfOrder(T)) {
1079 // Counter can get decremented out-of-order when there
1080 // are multiple types event in the bracket. Also emit an s_wait counter
1081 // with a conservative value of 0 for the counter.
1082 addWait(Wait, T, 0);
1083 } else {
1084 // If a counter has been maxed out avoid overflow by waiting for
1085 // MAX(CounterType) - 1 instead.
1086 unsigned NeededWait =
1087 std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1088 addWait(Wait, T, NeededWait);
1089 }
1090 }
1091 }
1092}
1093
// Record that the waits in \p Wait have been performed by forwarding each
// per-counter count to the single-counter overload below.
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(DS_CNT, Wait.DsCnt);
  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
  applyWaitcnt(KM_CNT, Wait.KmCnt);
}
1103
1104void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1105 const unsigned UB = getScoreUB(T);
1106 if (Count >= UB)
1107 return;
1108 if (Count != 0) {
1109 if (counterOutOfOrder(T))
1110 return;
1111 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1112 } else {
1113 setScoreLB(T, UB);
1114 PendingEvents &= ~WaitEventMaskForInst[T];
1115 }
1116}
1117
1118// Where there are multiple types of event in the bracket of a counter,
1119// the decrement may go out of order.
1120bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1121 // Scalar memory read always can go out of order.
1122 if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
1123 return true;
1124 return hasMixedPendingEvents(T);
1125}
1126
1127INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1128 false)
1131INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1132 false)
1133
1134char SIInsertWaitcnts::ID = 0;
1135
1136char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1137
1139 return new SIInsertWaitcnts();
1140}
1141
1142static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1143 unsigned NewEnc) {
1144 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1145 assert(OpIdx >= 0);
1146
1147 MachineOperand &MO = MI.getOperand(OpIdx);
1148
1149 if (NewEnc == MO.getImm())
1150 return false;
1151
1152 MO.setImm(NewEnc);
1153 return true;
1154}
1155
1156/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1157/// and if so, which counter it is waiting on.
1158static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1159 switch (Opcode) {
1160 case AMDGPU::S_WAIT_LOADCNT:
1161 return LOAD_CNT;
1162 case AMDGPU::S_WAIT_EXPCNT:
1163 return EXP_CNT;
1164 case AMDGPU::S_WAIT_STORECNT:
1165 return STORE_CNT;
1166 case AMDGPU::S_WAIT_SAMPLECNT:
1167 return SAMPLE_CNT;
1168 case AMDGPU::S_WAIT_BVHCNT:
1169 return BVH_CNT;
1170 case AMDGPU::S_WAIT_DSCNT:
1171 return DS_CNT;
1172 case AMDGPU::S_WAIT_KMCNT:
1173 return KM_CNT;
1174 default:
1175 return {};
1176 }
1177}
1178
1179bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1180 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1181 if (Opcode == Waitcnt->getOpcode())
1182 return false;
1183
1184 Waitcnt->setDesc(TII->get(Opcode));
1185 return true;
1186}
1187
/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
/// from \p Wait that were added by previous passes. Currently this pass
/// conservatively assumes that these preexisting waits are required for
/// correctness.
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  // First surviving S_WAITCNT in the run; later ones are folded into it.
  MachineInstr *WaitcntInstr = nullptr;
  // Likewise for the separate S_WAITCNT_VSCNT instruction.
  MachineInstr *WaitcntVsCntInstr = nullptr;

  // Walk the preexisting wait instructions between OldWaitcntInstr and It.
  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    if (II.isMetaInstruction())
      continue;

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    // Only soft waitcnts may be simplified away, and only when optimizing.
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);

      // Merge consecutive waitcnt of the same type by erasing multiples.
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;
    } else {
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
      Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);

      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  // Rewrite the surviving S_WAITCNT with the merged counts and record the
  // waited-for counts as applied in the bracket state.
  if (WaitcntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
    Modified |= promoteSoftWaitCnt(WaitcntInstr);

    ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
    ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
    ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
    Wait.LoadCnt = ~0u;
    Wait.ExpCnt = ~0u;
    Wait.DsCnt = ~0u;

    LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
                   ? dbgs()
                         << "applyPreexistingWaitcnt\n"
                         << "New Instr at block end: " << *WaitcntInstr << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntInstr << '\n');
  }

  // Likewise for the surviving S_WAITCNT_VSCNT.
  if (WaitcntVsCntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);

    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
    Wait.StoreCnt = ~0u;

    LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
                   ? dbgs() << "applyPreexistingWaitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}
1284
/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
/// required counters in \p Wait
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
  // single instruction while VScnt has its own instruction.
  if (Wait.hasWaitExceptStoreCnt()) {
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  // VScnt has a separate instruction; only targets with VScnt reach here.
  if (Wait.hasWaitStoreCnt()) {
    assert(ST->hasVscnt());

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
            .addImm(Wait.StoreCnt);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
1325
// \returns a wait of zero on every counter tracked pre-gfx12. VScnt is only
// included when requested and supported by the subtarget (~0u = "no wait").
WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
}
1330
// \returns a wait of zero on every gfx12+ counter. The store counter is only
// included when requested (~0u = "no wait").
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
}
1335
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
/// were added by previous passes. Currently this pass conservatively
/// assumes that these preexisting waits are required for correctness.
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  // First surviving combined-counter wait of each kind in the run.
  MachineInstr *CombinedLoadDsCntInstr = nullptr;
  MachineInstr *CombinedStoreDsCntInstr = nullptr;
  // One slot per extended counter type: the first single-counter wait seen.
  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};

  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    if (II.isMetaInstruction())
      continue;

    MachineInstr **UpdatableInstr;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    // Only soft waitcnts may be simplified away, and only when optimizing.
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
    // attempt to do more than that either.
    if (Opcode == AMDGPU::S_WAITCNT)
      continue;

    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);
      UpdatableInstr = &CombinedLoadDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);
      UpdatableInstr = &CombinedStoreDsCntInstr;
    } else {
      // A single-counter S_WAIT_*CNT instruction.
      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
      assert(CT.has_value());
      unsigned OldCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
      addWait(Wait, CT.value(), OldCnt);
      UpdatableInstr = &WaitInstrs[CT.value()];
    }

    // Merge consecutive waitcnt of the same type by erasing multiples.
    if (!*UpdatableInstr) {
      *UpdatableInstr = &II;
    } else {
      II.eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedLoadDsCntInstr) {
    // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
    // to be waited for. Otherwise, let the instruction be deleted so
    // the appropriate single counter wait instruction can be inserted
    // instead, when new S_WAIT_*CNT instructions are inserted by
    // createNewWaitcnt(). As a side effect, resetting the wait counts will
    // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
    // the loop below that deals with single counter instructions.
    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
      ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedLoadDsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedLoadDsCntInstr << '\n');
    } else {
      CombinedLoadDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedStoreDsCntInstr) {
    // Similarly for S_WAIT_STORECNT_DSCNT.
    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
      ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedStoreDsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedStoreDsCntInstr << '\n');
    } else {
      CombinedStoreDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  // Look for an opportunity to convert existing S_WAIT_LOADCNT,
  // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
  // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
  // instructions so that createNewWaitcnt() will create new combined
  // instructions to replace them.

  if (Wait.DsCnt != ~0u) {
    // This is a vector of addresses in WaitInstrs pointing to instructions
    // that should be removed if they are present.

    // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
    // both) need to be waited for, ensure that there are no existing
    // individual wait count instructions for these.

    if (Wait.LoadCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    } else if (Wait.StoreCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    }

    for (MachineInstr **WI : WaitsToErase) {
      if (!*WI)
        continue;

      (*WI)->eraseFromParent();
      *WI = nullptr;
      Modified = true;
    }
  }

  // Rewrite (or erase, if redundant) each surviving single-counter wait.
  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    if (!WaitInstrs[CT])
      continue;

    unsigned NewCnt = getWait(Wait, CT);
    if (NewCnt != ~0u) {
      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
                                           AMDGPU::OpName::simm16, NewCnt);
      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);

      ScoreBrackets.applyWaitcnt(CT, NewCnt);
      setNoWait(Wait, CT);

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: " << *WaitInstrs[CT]
                              << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitInstrs[CT] << '\n');
    } else {
      WaitInstrs[CT]->eraseFromParent();
      Modified = true;
    }
  }

  return Modified;
}
1522
/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // Check for opportunities to use combined wait instructions.
  if (Wait.DsCnt != ~0u) {
    MachineInstr *SWaitInst = nullptr;

    if (Wait.LoadCnt != ~0u) {
      // DScnt + LOADcnt can be encoded into one S_WAIT_LOADCNT_DSCNT.
      unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);

      SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
                      .addImm(Enc);

      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;
    } else if (Wait.StoreCnt != ~0u) {
      // DScnt + STOREcnt can be encoded into one S_WAIT_STORECNT_DSCNT.
      unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);

      SWaitInst =
          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
              .addImm(Enc);

      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;
    }

    if (SWaitInst) {
      Modified = true;

      LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  // Generate an instruction for any remaining counter that needs
  // waiting for.

  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    unsigned Count = getWait(Wait, CT);
    // ~0u means no wait is required on this counter.
    if (Count == ~0u)
      continue;

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(Count);

    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
1586
1587static bool readsVCCZ(const MachineInstr &MI) {
1588 unsigned Opc = MI.getOpcode();
1589 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1590 !MI.getOperand(1).isUndef();
1591}
1592
/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
  // Currently all conventions wait, but this may not always be the case.
  //
  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
  // sense to omit the wait and do it in the caller.
  return true;
}

/// \returns true if the callee is expected to wait for any outstanding waits
/// before returning.
static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
1605
/// Generate an s_waitcnt instruction to be placed before \p MI.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if a memory access requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
/// If FlushVmCnt is true, that means that we want to generate an s_waitcnt to
/// flush the vmcnt counter here.
1618bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1619 WaitcntBrackets &ScoreBrackets,
1620 MachineInstr *OldWaitcntInstr,
1621 bool FlushVmCnt) {
1622 setForceEmitWaitcnt();
1623
1624 if (MI.isMetaInstruction())
1625 return false;
1626
1628
1629 // FIXME: This should have already been handled by the memory legalizer.
1630 // Removing this currently doesn't affect any lit tests, but we need to
1631 // verify that nothing was relying on this. The number of buffer invalidates
1632 // being handled here should not be expanded.
1633 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1634 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1635 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1636 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1637 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1638 Wait.LoadCnt = 0;
1639 }
1640
1641 // All waits must be resolved at call return.
1642 // NOTE: this could be improved with knowledge of all call sites or
1643 // with knowledge of the called routines.
1644 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1645 MI.getOpcode() == AMDGPU::SI_RETURN ||
1646 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1647 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1648 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1649 }
1650 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1651 // stores. In this case it can be useful to send a message to explicitly
1652 // release all VGPRs before the stores have completed, but it is only safe to
1653 // do this if:
1654 // * there are no outstanding scratch stores
1655 // * we are not in Dynamic VGPR mode
1656 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1657 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1658 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
1659 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1660 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1661 ReleaseVGPRInsts.insert(&MI);
1662 }
1663 // Resolve vm waits before gs-done.
1664 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1665 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1666 ST->hasLegacyGeometry() &&
1667 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1669 Wait.LoadCnt = 0;
1670 }
1671
1672 // Export & GDS instructions do not read the EXEC mask until after the export
1673 // is granted (which can occur well after the instruction is issued).
1674 // The shader program must flush all EXP operations on the export-count
1675 // before overwriting the EXEC mask.
1676 else {
1677 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1678 // Export and GDS are tracked individually, either may trigger a waitcnt
1679 // for EXEC.
1680 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1681 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1682 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1683 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1684 Wait.ExpCnt = 0;
1685 }
1686 }
1687
1688 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1689 // The function is going to insert a wait on everything in its prolog.
1690 // This still needs to be careful if the call target is a load (e.g. a GOT
1691 // load). We also need to check WAW dependency with saved PC.
1693
1694 const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1695 if (CallAddrOp.isReg()) {
1696 RegInterval CallAddrOpInterval =
1697 ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOp);
1698
1699 ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
1700 Wait);
1701
1702 if (const auto *RtnAddrOp =
1703 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
1704 RegInterval RtnAddrOpInterval =
1705 ScoreBrackets.getRegInterval(&MI, MRI, TRI, *RtnAddrOp);
1706
1707 ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
1708 Wait);
1709 }
1710 }
1711 } else {
1712 // FIXME: Should not be relying on memoperands.
1713 // Look at the source operands of every instruction to see if
1714 // any of them results from a previous memory operation that affects
1715 // its current usage. If so, an s_waitcnt instruction needs to be
1716 // emitted.
1717 // If the source operand was defined by a load, add the s_waitcnt
1718 // instruction.
1719 //
1720 // Two cases are handled for destination operands:
1721 // 1) If the destination operand was defined by a load, add the s_waitcnt
1722 // instruction to guarantee the right WAW order.
1723 // 2) If a destination operand that was used by a recent export/store ins,
1724 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1725
1726 for (const MachineMemOperand *Memop : MI.memoperands()) {
1727 const Value *Ptr = Memop->getValue();
1728 if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1729 addWait(Wait, SmemAccessCounter, 0);
1730 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1731 SLoadAddresses.erase(Ptr);
1732 }
1733 unsigned AS = Memop->getAddrSpace();
1735 continue;
1736 // No need to wait before load from VMEM to LDS.
1737 if (TII->mayWriteLDSThroughDMA(MI))
1738 continue;
1739
1740 // LOAD_CNT is only relevant to vgpr or LDS.
1741 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1742 bool FoundAliasingStore = false;
1743 // Only objects with alias scope info were added to LDSDMAScopes array.
1744 // In the absense of the scope info we will not be able to disambiguate
1745 // aliasing here. There is no need to try searching for a corresponding
1746 // store slot. This is conservatively correct because in that case we
1747 // will produce a wait using the first (general) LDS DMA wait slot which
1748 // will wait on all of them anyway.
1749 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1750 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1751 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1752 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1753 FoundAliasingStore = true;
1754 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1755 }
1756 }
1757 }
1758 if (!FoundAliasingStore)
1759 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1760 if (Memop->isStore()) {
1761 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1762 }
1763 }
1764
1765 // Loop over use and def operands.
1766 for (const MachineOperand &Op : MI.operands()) {
1767 if (!Op.isReg())
1768 continue;
1769
1770 // If the instruction does not read tied source, skip the operand.
1771 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1772 continue;
1773
1774 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op);
1775
1776 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1777 if (IsVGPR) {
1778 // Implicit VGPR defs and uses are never a part of the memory
1779 // instructions description and usually present to account for
1780 // super-register liveness.
1781 // TODO: Most of the other instructions also have implicit uses
1782 // for the liveness accounting only.
1783 if (Op.isImplicit() && MI.mayLoadOrStore())
1784 continue;
1785
1786 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1787 // previous write and this write are the same type of VMEM
1788 // instruction, in which case they are (in some architectures)
1789 // guaranteed to write their results in order anyway.
1790 if (Op.isUse() || !updateVMCntOnly(MI) ||
1791 ScoreBrackets.hasOtherPendingVmemTypes(Interval,
1792 getVmemType(MI)) ||
1793 !ST->hasVmemWriteVgprInOrder()) {
1794 ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
1795 ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);
1796 ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
1797 ScoreBrackets.clearVgprVmemTypes(Interval);
1798 }
1799 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1800 ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
1801 }
1802 ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
1803 } else {
1804 ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
1805 }
1806 }
1807 }
1808 }
1809
1810 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1811 // not, we need to ensure the subtarget is capable of backing off barrier
1812 // instructions in case there are any outstanding memory operations that may
1813 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1814 if (TII->isBarrierStart(MI.getOpcode()) &&
1815 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1816 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
1817 }
1818
1819 // TODO: Remove this work-around, enable the assert for Bug 457939
1820 // after fixing the scheduler. Also, the Shader Compiler code is
1821 // independent of target.
1822 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1823 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1824 Wait.DsCnt = 0;
1825 }
1826 }
1827
1828 // Verify that the wait is actually needed.
1829 ScoreBrackets.simplifyWaitcnt(Wait);
1830
1831 // When forcing emit, we need to skip terminators because that would break the
1832 // terminators of the MBB if we emit a waitcnt between terminators.
1833 if (ForceEmitZeroFlag && !MI.isTerminator())
1834 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
1835
1836 if (ForceEmitWaitcnt[LOAD_CNT])
1837 Wait.LoadCnt = 0;
1838 if (ForceEmitWaitcnt[EXP_CNT])
1839 Wait.ExpCnt = 0;
1840 if (ForceEmitWaitcnt[DS_CNT])
1841 Wait.DsCnt = 0;
1842 if (ForceEmitWaitcnt[SAMPLE_CNT])
1843 Wait.SampleCnt = 0;
1844 if (ForceEmitWaitcnt[BVH_CNT])
1845 Wait.BvhCnt = 0;
1846 if (ForceEmitWaitcnt[KM_CNT])
1847 Wait.KmCnt = 0;
1848
1849 if (FlushVmCnt) {
1850 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
1851 Wait.LoadCnt = 0;
1852 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
1853 Wait.SampleCnt = 0;
1854 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
1855 Wait.BvhCnt = 0;
1856 }
1857
1858 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
1859 Wait.LoadCnt = 0;
1860
1861 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1862 OldWaitcntInstr);
1863}
1864
1865bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1868 WaitcntBrackets &ScoreBrackets,
1869 MachineInstr *OldWaitcntInstr) {
1870 bool Modified = false;
1871
1872 if (OldWaitcntInstr)
1873 // Try to merge the required wait with preexisting waitcnt instructions.
1874 // Also erase redundant waitcnt.
1875 Modified =
1876 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1877
1878 // Any counts that could have been applied to any existing waitcnt
1879 // instructions will have been done so, now deal with any remaining.
1880 ScoreBrackets.applyWaitcnt(Wait);
1881
1882 // ExpCnt can be merged into VINTERP.
1883 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1885 MachineOperand *WaitExp =
1886 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1887 if (Wait.ExpCnt < WaitExp->getImm()) {
1888 WaitExp->setImm(Wait.ExpCnt);
1889 Modified = true;
1890 }
1891 Wait.ExpCnt = ~0u;
1892
1893 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
1894 << "Update Instr: " << *It);
1895 }
1896
1897 if (WCG->createNewWaitcnt(Block, It, Wait))
1898 Modified = true;
1899
1900 return Modified;
1901}
1902
1903// This is a flat memory operation. Check to see if it has memory tokens other
1904// than LDS. Other address spaces supported by flat memory operations involve
1905// global memory.
// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS. Returns true (conservative) when no memory operands are present.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // All flat instructions use the VMEM counter.
  assert(TII->usesVM_CNT(MI));

  // If there are no memory operands then conservatively assume the flat
  // operation may access VMEM.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The
  // REGION (GDS) address space is not supported by flat operations. Therefore,
  // simply return true unless only the LDS address space is found.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    // NOTE(review): this copy of the file appears to be missing one statement
    // here (between the AS read and the test below) — verify against upstream.
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }

  return false;
}
1931
1932// This is a flat memory operation. Check to see if it has memory tokens for
1933// either LDS or FLAT.
1934bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1935 assert(TII->isFLAT(MI));
1936
1937 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
1938 if (!TII->usesLGKM_CNT(MI))
1939 return false;
1940
1941 // If in tgsplit mode then there can be no use of LDS.
1942 if (ST->isTgSplitEnabled())
1943 return false;
1944
1945 // If there are no memory operands then conservatively assume the flat
1946 // operation may access LDS.
1947 if (MI.memoperands_empty())
1948 return true;
1949
1950 // See if any memory operand specifies an address space that involves LDS.
1951 for (const MachineMemOperand *Memop : MI.memoperands()) {
1952 unsigned AS = Memop->getAddrSpace();
1954 return true;
1955 }
1956
1957 return false;
1958}
1959
1960// This is a flat memory operation. Check to see if it has memory tokens for
1961// either scratch or FLAT.
1962bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
1963 const MachineInstr &MI) const {
1964 assert(TII->isFLAT(MI));
1965
1966 // SCRATCH instructions always access scratch.
1967 if (TII->isFLATScratch(MI))
1968 return true;
1969
1970 // GLOBAL instructions never access scratch.
1971 if (TII->isFLATGlobal(MI))
1972 return false;
1973
1974 // If there are no memory operands then conservatively assume the flat
1975 // operation may access scratch.
1976 if (MI.memoperands_empty())
1977 return true;
1978
1979 // See if any memory operand specifies an address space that involves scratch.
1980 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
1981 unsigned AS = Memop->getAddrSpace();
1982 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
1983 });
1984}
1985
1987 auto Opc = Inst.getOpcode();
1988 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
1989 Opc == AMDGPU::GLOBAL_WBINV;
1990}
1991
1992void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1993 WaitcntBrackets *ScoreBrackets) {
1994 // Now look at the instruction opcode. If it is a memory access
1995 // instruction, update the upper-bound of the appropriate counter's
1996 // bracket and the destination operand scores.
1997 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
1998
1999 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2000 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2001 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2002 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
2003 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
2004 } else {
2005 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2006 }
2007 } else if (TII->isFLAT(Inst)) {
2008 // TODO: Track this properly.
2009 if (isCacheInvOrWBInst(Inst))
2010 return;
2011
2012 assert(Inst.mayLoadOrStore());
2013
2014 int FlatASCount = 0;
2015
2016 if (mayAccessVMEMThroughFlat(Inst)) {
2017 ++FlatASCount;
2018 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2019 Inst);
2020 }
2021
2022 if (mayAccessLDSThroughFlat(Inst)) {
2023 ++FlatASCount;
2024 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2025 }
2026
2027 // A Flat memory operation must access at least one address space.
2028 assert(FlatASCount);
2029
2030 // This is a flat memory operation that access both VMEM and LDS, so note it
2031 // - it will require that both the VM and LGKM be flushed to zero if it is
2032 // pending when a VM or LGKM dependency occurs.
2033 if (FlatASCount > 1)
2034 ScoreBrackets->setPendingFlat();
2035 } else if (SIInstrInfo::isVMEM(Inst) &&
2037 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2038 Inst);
2039
2040 if (ST->vmemWriteNeedsExpWaitcnt() &&
2041 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2042 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
2043 }
2044 } else if (TII->isSMRD(Inst)) {
2045 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2046 } else if (Inst.isCall()) {
2047 if (callWaitsOnFunctionReturn(Inst)) {
2048 // Act as a wait on everything
2049 ScoreBrackets->applyWaitcnt(
2050 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2051 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2052 } else {
2053 // May need to way wait for anything.
2054 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2055 }
2056 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2057 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
2058 } else if (TII->isVINTERP(Inst)) {
2059 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2060 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2061 } else if (SIInstrInfo::isEXP(Inst)) {
2062 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2064 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
2065 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2066 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
2067 else
2068 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2069 } else {
2070 switch (Inst.getOpcode()) {
2071 case AMDGPU::S_SENDMSG:
2072 case AMDGPU::S_SENDMSG_RTN_B32:
2073 case AMDGPU::S_SENDMSG_RTN_B64:
2074 case AMDGPU::S_SENDMSGHALT:
2075 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
2076 break;
2077 case AMDGPU::S_MEMTIME:
2078 case AMDGPU::S_MEMREALTIME:
2079 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2080 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2081 case AMDGPU::S_BARRIER_LEAVE:
2082 case AMDGPU::S_GET_BARRIER_STATE_M0:
2083 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2084 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2085 break;
2086 }
2087 }
2088}
2089
2090bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2091 unsigned OtherScore) {
2092 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2093 unsigned OtherShifted =
2094 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2095 Score = std::max(MyShifted, OtherShifted);
2096 return OtherShifted > MyShifted;
2097}
2098
/// Merge the pending events and associated score brackets of \p Other into
2100/// this brackets status.
2101///
2102/// Returns whether the merge resulted in a change that requires tighter waits
2103/// (i.e. the merged brackets strictly dominate the original brackets).
2104bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2105 bool StrictDom = false;
2106
2107 VgprUB = std::max(VgprUB, Other.VgprUB);
2108 SgprUB = std::max(SgprUB, Other.SgprUB);
2109
2110 for (auto T : inst_counter_types(MaxCounter)) {
2111 // Merge event flags for this counter
2112 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2113 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2114 if (OtherEvents & ~OldEvents)
2115 StrictDom = true;
2116 PendingEvents |= OtherEvents;
2117
2118 // Merge scores for this counter
2119 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2120 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2121 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2122 if (NewUB < ScoreLBs[T])
2123 report_fatal_error("waitcnt score overflow");
2124
2125 MergeInfo M;
2126 M.OldLB = ScoreLBs[T];
2127 M.OtherLB = Other.ScoreLBs[T];
2128 M.MyShift = NewUB - ScoreUBs[T];
2129 M.OtherShift = NewUB - Other.ScoreUBs[T];
2130
2131 ScoreUBs[T] = NewUB;
2132
2133 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2134
2135 for (int J = 0; J <= VgprUB; J++)
2136 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
2137
2138 if (T == SmemAccessCounter) {
2139 for (int J = 0; J <= SgprUB; J++)
2140 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
2141 }
2142 }
2143
2144 for (int J = 0; J <= VgprUB; J++) {
2145 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2146 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2147 VgprVmemTypes[J] = NewVmemTypes;
2148 }
2149
2150 return StrictDom;
2151}
2152
2153static bool isWaitInstr(MachineInstr &Inst) {
2154 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2155 return Opcode == AMDGPU::S_WAITCNT ||
2156 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2157 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2158 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2159 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2160 counterTypeForInstr(Opcode).has_value();
2161}
2162
2163// Generate s_waitcnt instructions where needed.
2164bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2166 WaitcntBrackets &ScoreBrackets) {
2167 bool Modified = false;
2168
2169 LLVM_DEBUG({
2170 dbgs() << "*** Block" << Block.getNumber() << " ***";
2171 ScoreBrackets.dump();
2172 });
2173
2174 // Track the correctness of vccz through this basic block. There are two
2175 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2176 // ST->partialVCCWritesUpdateVCCZ().
2177 bool VCCZCorrect = true;
2178 if (ST->hasReadVCCZBug()) {
2179 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2180 // to vcc and then issued an smem load.
2181 VCCZCorrect = false;
2182 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2183 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2184 // to vcc_lo or vcc_hi.
2185 VCCZCorrect = false;
2186 }
2187
2188 // Walk over the instructions.
2189 MachineInstr *OldWaitcntInstr = nullptr;
2190
2191 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2192 E = Block.instr_end();
2193 Iter != E;) {
2194 MachineInstr &Inst = *Iter;
2195
2196 // Track pre-existing waitcnts that were added in earlier iterations or by
2197 // the memory legalizer.
2198 if (isWaitInstr(Inst)) {
2199 if (!OldWaitcntInstr)
2200 OldWaitcntInstr = &Inst;
2201 ++Iter;
2202 continue;
2203 }
2204
2205 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2206 isPreheaderToFlush(Block, ScoreBrackets);
2207
2208 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2209 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2210 FlushVmCnt);
2211 OldWaitcntInstr = nullptr;
2212
2213 // Restore vccz if it's not known to be correct already.
2214 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
2215
2216 // Don't examine operands unless we need to track vccz correctness.
2217 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2218 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2219 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2220 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2221 if (!ST->partialVCCWritesUpdateVCCZ())
2222 VCCZCorrect = false;
2223 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2224 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2225 // vccz bit, so when we detect that an instruction may read from a
2226 // corrupt vccz bit, we need to:
2227 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2228 // operations to complete.
2229 // 2. Restore the correct value of vccz by writing the current value
2230 // of vcc back to vcc.
2231 if (ST->hasReadVCCZBug() &&
2232 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2233 // Writes to vcc while there's an outstanding smem read may get
2234 // clobbered as soon as any read completes.
2235 VCCZCorrect = false;
2236 } else {
2237 // Writes to vcc will fix any incorrect value in vccz.
2238 VCCZCorrect = true;
2239 }
2240 }
2241 }
2242
2243 if (TII->isSMRD(Inst)) {
2244 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2245 // No need to handle invariant loads when avoiding WAR conflicts, as
2246 // there cannot be a vector store to the same memory location.
2247 if (!Memop->isInvariant()) {
2248 const Value *Ptr = Memop->getValue();
2249 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2250 }
2251 }
2252 if (ST->hasReadVCCZBug()) {
2253 // This smem read could complete and clobber vccz at any time.
2254 VCCZCorrect = false;
2255 }
2256 }
2257
2258 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2259
2260 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
2261 AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2262 Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
2263 ScoreBrackets.simplifyWaitcnt(Wait);
2264 Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
2265 ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
2266 }
2267
2268 LLVM_DEBUG({
2269 Inst.print(dbgs());
2270 ScoreBrackets.dump();
2271 });
2272
2273 // TODO: Remove this work-around after fixing the scheduler and enable the
2274 // assert above.
2275 if (RestoreVCCZ) {
2276 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2277 // bit is updated, so we can restore the bit by reading the value of
2278 // vcc and then writing it back to the register.
2279 BuildMI(Block, Inst, Inst.getDebugLoc(),
2280 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2281 TRI->getVCC())
2282 .addReg(TRI->getVCC());
2283 VCCZCorrect = true;
2284 Modified = true;
2285 }
2286
2287 ++Iter;
2288 }
2289
2290 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2291 // needed.
2293 if (Block.getFirstTerminator() == Block.end() &&
2294 isPreheaderToFlush(Block, ScoreBrackets)) {
2295 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2296 Wait.LoadCnt = 0;
2297 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2298 Wait.SampleCnt = 0;
2299 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2300 Wait.BvhCnt = 0;
2301 }
2302
2303 // Combine or remove any redundant waitcnts at the end of the block.
2304 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2305 OldWaitcntInstr);
2306
2307 return Modified;
2308}
2309
2310// Return true if the given machine basic block is a preheader of a loop in
2311// which we want to flush the vmcnt counter, and false otherwise.
2312bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
2313 WaitcntBrackets &ScoreBrackets) {
2314 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2315 if (!IsInserted)
2316 return Iterator->second;
2317
2319 if (!Succ)
2320 return false;
2321
2322 MachineLoop *Loop = MLI->getLoopFor(Succ);
2323 if (!Loop)
2324 return false;
2325
2326 if (Loop->getLoopPreheader() == &MBB &&
2327 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2328 Iterator->second = true;
2329 return true;
2330 }
2331
2332 return false;
2333}
2334
2335bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2336 return SIInstrInfo::isVMEM(MI) ||
2337 (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
2338}
2339
2340// Return true if it is better to flush the vmcnt counter in the preheader of
2341// the given loop. We currently decide to flush in two situations:
2342// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2343// vgpr containing a value that is loaded outside of the loop. (Only on
2344// targets with no vscnt counter).
2345// 2. The loop contains vmem load(s), but the loaded values are not used in the
2346// loop, and at least one use of a vgpr containing a value that is loaded
2347// outside of the loop.
2348bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2349 WaitcntBrackets &Brackets) {
2350 bool HasVMemLoad = false;
2351 bool HasVMemStore = false;
2352 bool UsesVgprLoadedOutside = false;
2353 DenseSet<Register> VgprUse;
2354 DenseSet<Register> VgprDef;
2355
2356 for (MachineBasicBlock *MBB : ML->blocks()) {
2357 for (MachineInstr &MI : *MBB) {
2358 if (isVMEMOrFlatVMEM(MI)) {
2359 if (MI.mayLoad())
2360 HasVMemLoad = true;
2361 if (MI.mayStore())
2362 HasVMemStore = true;
2363 }
2364 for (const MachineOperand &Op : MI.all_uses()) {
2365 if (!TRI->isVectorRegister(*MRI, Op.getReg()))
2366 continue;
2367 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
2368 // Vgpr use
2369 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2370 // If we find a register that is loaded inside the loop, 1. and 2.
2371 // are invalidated and we can exit.
2372 if (VgprDef.contains(RegNo))
2373 return false;
2374 VgprUse.insert(RegNo);
2375 // If at least one of Op's registers is in the score brackets, the
2376 // value is likely loaded outside of the loop.
2377 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2378 Brackets.getScoreLB(LOAD_CNT) ||
2379 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2380 Brackets.getScoreLB(SAMPLE_CNT) ||
2381 Brackets.getRegScore(RegNo, BVH_CNT) >
2382 Brackets.getScoreLB(BVH_CNT)) {
2383 UsesVgprLoadedOutside = true;
2384 break;
2385 }
2386 }
2387 }
2388
2389 // VMem load vgpr def
2390 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
2391 for (const MachineOperand &Op : MI.all_defs()) {
2392 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
2393 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2394 // If we find a register that is loaded inside the loop, 1. and 2.
2395 // are invalidated and we can exit.
2396 if (VgprUse.contains(RegNo))
2397 return false;
2398 VgprDef.insert(RegNo);
2399 }
2400 }
2401 }
2402 }
2403 }
2404 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2405 return true;
2406 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2407}
2408
2409bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2410 ST = &MF.getSubtarget<GCNSubtarget>();
2411 TII = ST->getInstrInfo();
2412 TRI = &TII->getRegisterInfo();
2413 MRI = &MF.getRegInfo();
2415 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2416 PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2417 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2418 AA = &AAR->getAAResults();
2419
2421
2422 if (ST->hasExtendedWaitCounts()) {
2423 MaxCounter = NUM_EXTENDED_INST_CNTS;
2424 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2425 WCG = &WCGGFX12Plus;
2426 } else {
2427 MaxCounter = NUM_NORMAL_INST_CNTS;
2428 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2429 WCG = &WCGPreGFX12;
2430 }
2431
2432 for (auto T : inst_counter_types())
2433 ForceEmitWaitcnt[T] = false;
2434
2435 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2436
2437 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2438
2439 HardwareLimits Limits = {};
2440 if (ST->hasExtendedWaitCounts()) {
2441 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2442 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2443 } else {
2444 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2445 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2446 }
2447 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2448 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2449 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2450 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2451 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2452
2453 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
2454 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2455 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2456 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2457
2458 RegisterEncoding Encoding = {};
2459 Encoding.VGPR0 =
2460 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2461 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
2462 Encoding.SGPR0 =
2463 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2464 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
2465
2466 BlockInfos.clear();
2467 bool Modified = false;
2468
2469 MachineBasicBlock &EntryBB = MF.front();
2471
2472 if (!MFI->isEntryFunction()) {
2473 // Wait for any outstanding memory operations that the input registers may
2474 // depend on. We can't track them and it's better to do the wait after the
2475 // costly call sequence.
2476
2477 // TODO: Could insert earlier and schedule more liberally with operations
2478 // that only use caller preserved registers.
2479 for (MachineBasicBlock::iterator E = EntryBB.end();
2480 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2481 ;
2482
2483 if (ST->hasExtendedWaitCounts()) {
2484 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2485 .addImm(0);
2486 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2487 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
2488 continue;
2489
2490 BuildMI(EntryBB, I, DebugLoc(),
2491 TII->get(instrsForExtendedCounterTypes[CT]))
2492 .addImm(0);
2493 }
2494 } else {
2495 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2496 }
2497
2498 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2499 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2500 SmemAccessCounter);
2501 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2502 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2503
2504 Modified = true;
2505 }
2506
2507 // Keep iterating over the blocks in reverse post order, inserting and
2508 // updating s_waitcnt where needed, until a fix point is reached.
2510 BlockInfos.insert({MBB, BlockInfo()});
2511
2512 std::unique_ptr<WaitcntBrackets> Brackets;
2513 bool Repeat;
2514 do {
2515 Repeat = false;
2516
2517 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2518 ++BII) {
2519 MachineBasicBlock *MBB = BII->first;
2520 BlockInfo &BI = BII->second;
2521 if (!BI.Dirty)
2522 continue;
2523
2524 if (BI.Incoming) {
2525 if (!Brackets)
2526 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2527 else
2528 *Brackets = *BI.Incoming;
2529 } else {
2530 if (!Brackets)
2531 Brackets = std::make_unique<WaitcntBrackets>(
2532 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2533 SmemAccessCounter);
2534 else
2535 *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
2536 WaitEventMaskForInst, SmemAccessCounter);
2537 }
2538
2539 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2540 BI.Dirty = false;
2541
2542 if (Brackets->hasPendingEvent()) {
2543 BlockInfo *MoveBracketsToSucc = nullptr;
2544 for (MachineBasicBlock *Succ : MBB->successors()) {
2545 auto *SuccBII = BlockInfos.find(Succ);
2546 BlockInfo &SuccBI = SuccBII->second;
2547 if (!SuccBI.Incoming) {
2548 SuccBI.Dirty = true;
2549 if (SuccBII <= BII)
2550 Repeat = true;
2551 if (!MoveBracketsToSucc) {
2552 MoveBracketsToSucc = &SuccBI;
2553 } else {
2554 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2555 }
2556 } else if (SuccBI.Incoming->merge(*Brackets)) {
2557 SuccBI.Dirty = true;
2558 if (SuccBII <= BII)
2559 Repeat = true;
2560 }
2561 }
2562 if (MoveBracketsToSucc)
2563 MoveBracketsToSucc->Incoming = std::move(Brackets);
2564 }
2565 }
2566 } while (Repeat);
2567
2568 if (ST->hasScalarStores()) {
2570 bool HaveScalarStores = false;
2571
2572 for (MachineBasicBlock &MBB : MF) {
2573 for (MachineInstr &MI : MBB) {
2574 if (!HaveScalarStores && TII->isScalarStore(MI))
2575 HaveScalarStores = true;
2576
2577 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2578 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2579 EndPgmBlocks.push_back(&MBB);
2580 }
2581 }
2582
2583 if (HaveScalarStores) {
2584 // If scalar writes are used, the cache must be flushed or else the next
2585 // wave to reuse the same scratch memory can be clobbered.
2586 //
2587 // Insert s_dcache_wb at wave termination points if there were any scalar
2588 // stores, and only if the cache hasn't already been flushed. This could
2589 // be improved by looking across blocks for flushes in postdominating
2590 // blocks from the stores but an explicitly requested flush is probably
2591 // very rare.
2592 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2593 bool SeenDCacheWB = false;
2594
2595 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2596 I != E; ++I) {
2597 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2598 SeenDCacheWB = true;
2599 else if (TII->isScalarStore(*I))
2600 SeenDCacheWB = false;
2601
2602 // FIXME: It would be better to insert this before a waitcnt if any.
2603 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2604 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2605 !SeenDCacheWB) {
2606 Modified = true;
2607 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2608 }
2609 }
2610 }
2611 }
2612 }
2613
2614 // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2615 // instructions.
2616 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
2617 // waveslot limited kernel runs slower with the deallocation.
2618 if (!ReleaseVGPRInsts.empty() &&
2619 (MF.getFrameInfo().hasCalls() ||
2620 ST->getOccupancyWithNumVGPRs(
2621 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
2623 for (MachineInstr *MI : ReleaseVGPRInsts) {
2624 if (ST->requiresNopBeforeDeallocVGPRs()) {
2625 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2626 TII->get(AMDGPU::S_NOP))
2627 .addImm(0);
2628 }
2629 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2630 TII->get(AMDGPU::S_SENDMSG))
2632 Modified = true;
2633 }
2634 }
2635 ReleaseVGPRInsts.clear();
2636 PreheadersToFlush.clear();
2637 SLoadAddresses.clear();
2638
2639 return Modified;
2640}
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:190
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Size
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1315
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:235
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
static bool isCacheInvOrWBInst(MachineInstr &Inst)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static bool readsVCCZ(const MachineInstr &MI)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define DEBUG_TYPE
SI Insert Waitcnts
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
Provides some synthesis utilities to produce sequences of values.
static const uint32_t IV[8]
Definition: blake3_impl.h:78
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:256
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
This class represents an Operation in the Expression.
static bool isCounterSet(unsigned ID)
Definition: DebugCounter.h:96
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:87
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:71
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:577
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:349
bool isCall(QueryType Type=AnyInBundle) const
Definition: MachineInstr.h:958
iterator_range< filtered_mop_iterator > all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
Definition: MachineInstr.h:774
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
iterator_range< mop_iterator > defs()
Returns a range over all explicit operands that are register definitions.
Definition: MachineInstr.h:730
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:790
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:501
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:764
A description of a memory reference used in the backend.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
iterator begin()
Definition: MapVector.h:69
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
void clear()
Definition: MapVector.h:88
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:441
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:645
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:658
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:700
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:597
static bool isLDSDIR(const MachineInstr &MI)
Definition: SIInstrInfo.h:842
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:579
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:637
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:605
static bool isAtomicRet(const MachineInstr &MI)
Definition: SIInstrInfo.h:682
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:981
static bool isVINTERP(const MachineInstr &MI)
Definition: SIInstrInfo.h:850
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:589
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:621
static bool isAtomicNoRet(const MachineInstr &MI)
Definition: SIInstrInfo.h:674
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
self_iterator getIterator()
Definition: ilist_node.h:132
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI)
If Reg is a pseudo reg, return the correct hardware register given STI otherwise return Reg.
unsigned getStorecntBitMask(const IsaVersion &Version)
IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition: Sequence.h:337
@ Wait
Definition: Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
char & SIInsertWaitcntsID
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
@ None
Definition: CodeGenData.h:106
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
FunctionPass * createSIInsertWaitcntsPass()
Instruction set architecture version.
Definition: TargetParser.h:130
Represents the counter values to wait for in an s_waitcnt instruction.
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
static constexpr bool is_iterable
Definition: Sequence.h:100