LLVM 22.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45
46#define DEBUG_TYPE "si-insert-waitcnts"
47
// Debug counters that can force emission of maximal waits for a given
// counter class; used to bisect waitcnt-related issues (debug builds only,
// see setForceEmitWaitcnt()).
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

// Command-line override: emit every waitcnt as a wait for all counters to
// drain to zero (maximally conservative).
static cl::opt<bool>
    ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
                      cl::desc("Force all waitcnt instrs to be emitted as "
                               "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
                      cl::init(false), cl::Hidden);
60
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc("Force all waitcnt load counters to wait until 0"),
64 cl::init(false), cl::Hidden);
65
66namespace {
67// Class of object that encapsulates latest instruction counter score
68// associated with the operand. Used for determining whether
69// s_waitcnt instruction needs to be emitted.
70
// The hardware counters this pass tracks. The first four exist on all
// targets ("normal" mode); the remainder only on targets with extended
// wait counts (gfx12+ unless noted otherwise).
enum InstCounterType {
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LKGMcnt prior to gfx12.
  EXP_CNT,      // Export count (all targets).
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS,
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
  BVH_CNT,                           // gfx12+ only.
  KM_CNT,                            // gfx12+ only.
  X_CNT,                             // gfx1250.
  NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};
84} // namespace
85
86namespace llvm {
// Opt InstCounterType into llvm::enum_seq so counters can be iterated
// (see inst_counter_types()).
template <> struct enum_iteration_traits<InstCounterType> {
  static constexpr bool is_iterable = true;
};
90} // namespace llvm
91
92namespace {
// Return an iterator over all counters between LOAD_CNT (the first counter)
// and \c MaxCounter (exclusive, default value yields an enumeration over
// all counters).
auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
  return enum_seq(LOAD_CNT, MaxCounter);
}

// Half-open range [first, second) of slots in the register scoring tables
// (see RegisterMapping for the slot numbering).
using RegInterval = std::pair<int, int>;
101
// Maximum value of each wait counter on the current subtarget; queried
// through SIInsertWaitcnts::getWaitCountMax().
struct HardwareLimits {
  unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
  unsigned ExpcntMax;
  unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
  unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
  unsigned SamplecntMax; // gfx12+ only.
  unsigned BvhcntMax; // gfx12+ only.
  unsigned KmcntMax; // gfx12+ only.
  unsigned XcntMax; // gfx1250.
};
112
// X-macro listing every wait event the pass tracks. It is expanded below to
// generate both the WaitEventType enum and the WaitEventTypeName table.
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_ACCESS)              /* vmem read & write */                       \
  DECL(VMEM_READ_ACCESS)         /* vmem read */                               \
  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         \
  DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
  DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
  DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
  DECL(VMEM_GROUP)               /* vmem group */                              \
  DECL(LDS_ACCESS)               /* lds read & write */                        \
  DECL(GDS_ACCESS)               /* gds read & write */                        \
  DECL(SQ_MESSAGE)               /* send message */                            \
  DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
  DECL(SMEM_GROUP)               /* scalar-memory group */                     \
  DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
  DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
  DECL(EXP_POS_ACCESS)           /* write to export position */                \
  DECL(EXP_PARAM_ACCESS)         /* write to export parameter */               \
  DECL(VMW_GPR_LOCK)             /* vmem write holding on its data src */      \
  DECL(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */
132
133// clang-format off
134#define AMDGPU_EVENT_ENUM(Name) Name,
135enum WaitEventType {
137 NUM_WAIT_EVENTS
138};
139#undef AMDGPU_EVENT_ENUM
140
141#define AMDGPU_EVENT_NAME(Name) #Name,
142static constexpr StringLiteral WaitEventTypeName[] = {
144};
145#undef AMDGPU_EVENT_NAME
146// clang-format on
147
// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets.
                     // AGPR slots are numbered ArchVGPR index + AGPR_OFFSET.
  SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets.
  // Artificial register slots to track LDS writes into specific LDS locations
  // if a location is known. When slots are exhausted or location is
  // unknown use the first slot. The first slot is also always updated in
  // addition to known location's slot to properly generate waits if dependent
  // instruction's location is unknown.
  FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
  NUM_LDS_VGPRS = 9, // One more than the stores we track.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
};
167
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  // Number of VMEM types; also sizes the VmemType -> event mapping table.
  NUM_VMEM_TYPES
};
182
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true. Indexed by InstCounterType, so the order must match the
// enum's declaration order.
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
190
191static bool updateVMCntOnly(const MachineInstr &Inst) {
192 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
194}
195
#ifndef NDEBUG
// True when only the four pre-gfx12 ("normal") counters are in use, i.e.
// none of the extended gfx12+ counters. Assertion helper only.
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG
201
202VmemType getVmemType(const MachineInstr &Inst) {
203 assert(updateVMCntOnly(Inst));
204 if (!SIInstrInfo::isImage(Inst))
205 return VMEM_NOSAMPLER;
207 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
209
210 if (BaseInfo->BVH)
211 return VMEM_BVH;
212
213 // We have to make an additional check for isVSAMPLE here since some
214 // instructions don't have a sampler, but are still classified as sampler
215 // instructions for the purposes of e.g. waitcnt.
216 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
217 return VMEM_SAMPLER;
218
219 return VMEM_NOSAMPLER;
220}
221
222unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
223 switch (T) {
224 case LOAD_CNT:
225 return Wait.LoadCnt;
226 case EXP_CNT:
227 return Wait.ExpCnt;
228 case DS_CNT:
229 return Wait.DsCnt;
230 case STORE_CNT:
231 return Wait.StoreCnt;
232 case SAMPLE_CNT:
233 return Wait.SampleCnt;
234 case BVH_CNT:
235 return Wait.BvhCnt;
236 case KM_CNT:
237 return Wait.KmCnt;
238 case X_CNT:
239 return Wait.XCnt;
240 default:
241 llvm_unreachable("bad InstCounterType");
242 }
243}
244
245void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
246 unsigned &WC = getCounterRef(Wait, T);
247 WC = std::min(WC, Count);
248}
249
250void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
251 getCounterRef(Wait, T) = ~0u;
252}
253
254unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
255 return getCounterRef(Wait, T);
256}
257
258// Mapping from event to counter according to the table masks.
259InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
260 for (auto T : inst_counter_types()) {
261 if (masks[T] & (1 << E))
262 return T;
263 }
264 llvm_unreachable("event type has no associated counter");
265}
266
267class WaitcntBrackets;
268
269// This abstracts the logic for generating and updating S_WAIT* instructions
270// away from the analysis that determines where they are needed. This was
271// done because the set of counters and instructions for waiting on them
272// underwent a major shift with gfx12, sufficiently so that having this
273// abstraction allows the main analysis logic to be simpler than it would
274// otherwise have had to become.
275class WaitcntGenerator {
276protected:
277 const GCNSubtarget *ST = nullptr;
278 const SIInstrInfo *TII = nullptr;
280 InstCounterType MaxCounter;
281 bool OptNone;
282
283public:
284 WaitcntGenerator() = default;
285 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
286 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
287 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
288 OptNone(MF.getFunction().hasOptNone() ||
289 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
290
291 // Return true if the current function should be compiled with no
292 // optimization.
293 bool isOptNone() const { return OptNone; }
294
295 // Edits an existing sequence of wait count instructions according
296 // to an incoming Waitcnt value, which is itself updated to reflect
297 // any new wait count instructions which may need to be generated by
298 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
299 // were made.
300 //
301 // This editing will usually be merely updated operands, but it may also
302 // delete instructions if the incoming Wait value indicates they are not
303 // needed. It may also remove existing instructions for which a wait
304 // is needed if it can be determined that it is better to generate new
305 // instructions later, as can happen on gfx12.
306 virtual bool
307 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
308 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
310
311 // Transform a soft waitcnt into a normal one.
312 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
313
314 // Generates new wait count instructions according to the value of
315 // Wait, returning true if any new instructions were created.
316 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
319
320 // Returns an array of bit masks which can be used to map values in
321 // WaitEventType to corresponding counter values in InstCounterType.
322 virtual const unsigned *getWaitEventMask() const = 0;
323
324 // Returns a new waitcnt with all counters except VScnt set to 0. If
325 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
326 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
327
328 virtual ~WaitcntGenerator() = default;
329
330 // Create a mask value from the initializer list of wait event types.
331 static constexpr unsigned
332 eventMask(std::initializer_list<WaitEventType> Events) {
333 unsigned Mask = 0;
334 for (auto &E : Events)
335 Mask |= 1 << E;
336
337 return Mask;
338 }
339};
340
341class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
342public:
343 WaitcntGeneratorPreGFX12() = default;
344 WaitcntGeneratorPreGFX12(const MachineFunction &MF)
345 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
346
347 bool
348 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
349 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
350 MachineBasicBlock::instr_iterator It) const override;
351
352 bool createNewWaitcnt(MachineBasicBlock &Block,
354 AMDGPU::Waitcnt Wait) override;
355
356 const unsigned *getWaitEventMask() const override {
357 assert(ST);
358
359 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
360 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
361 VMEM_BVH_READ_ACCESS}),
362 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
363 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
364 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
365 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
366 0,
367 0,
368 0,
369 0};
370
371 return WaitEventMaskForInstPreGFX12;
372 }
373
374 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
375};
376
377class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
378public:
379 WaitcntGeneratorGFX12Plus() = default;
380 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
381 InstCounterType MaxCounter)
382 : WaitcntGenerator(MF, MaxCounter) {}
383
384 bool
385 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
386 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
387 MachineBasicBlock::instr_iterator It) const override;
388
389 bool createNewWaitcnt(MachineBasicBlock &Block,
391 AMDGPU::Waitcnt Wait) override;
392
393 const unsigned *getWaitEventMask() const override {
394 assert(ST);
395
396 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
397 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
398 eventMask({LDS_ACCESS, GDS_ACCESS}),
399 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
400 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
401 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
402 eventMask({VMEM_SAMPLER_READ_ACCESS}),
403 eventMask({VMEM_BVH_READ_ACCESS}),
404 eventMask({SMEM_ACCESS, SQ_MESSAGE}),
405 eventMask({VMEM_GROUP, SMEM_GROUP})};
406
407 return WaitEventMaskForInstGFX12Plus;
408 }
409
410 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
411};
412
413class SIInsertWaitcnts {
414public:
415 const GCNSubtarget *ST;
416 InstCounterType SmemAccessCounter;
417 InstCounterType MaxCounter;
418 const unsigned *WaitEventMaskForInst;
419
420private:
421 const SIInstrInfo *TII = nullptr;
422 const SIRegisterInfo *TRI = nullptr;
423 const MachineRegisterInfo *MRI = nullptr;
424
426 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
427 MachineLoopInfo *MLI;
429 AliasAnalysis *AA = nullptr;
430
431 struct BlockInfo {
432 std::unique_ptr<WaitcntBrackets> Incoming;
433 bool Dirty = true;
434 };
435
437
438 bool ForceEmitWaitcnt[NUM_INST_CNTS];
439
440 // In any given run of this pass, WCG will point to one of these two
441 // generator objects, which must have been re-initialised before use
442 // from a value made using a subtarget constructor.
443 WaitcntGeneratorPreGFX12 WCGPreGFX12;
444 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
445
446 WaitcntGenerator *WCG = nullptr;
447
448 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
449 // message.
450 DenseSet<MachineInstr *> ReleaseVGPRInsts;
451
452 HardwareLimits Limits;
453
public:
  // Analyses are owned by the caller (pass or pass manager) and must
  // outlive this object.
  SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
                   AliasAnalysis *AA)
      : MLI(MLI), PDT(PDT), AA(AA) {
    // Keep the debug counters referenced so builds in which they are
    // otherwise unused do not warn about them.
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }
462
463 unsigned getWaitCountMax(InstCounterType T) const {
464 switch (T) {
465 case LOAD_CNT:
466 return Limits.LoadcntMax;
467 case DS_CNT:
468 return Limits.DscntMax;
469 case EXP_CNT:
470 return Limits.ExpcntMax;
471 case STORE_CNT:
472 return Limits.StorecntMax;
473 case SAMPLE_CNT:
474 return Limits.SamplecntMax;
475 case BVH_CNT:
476 return Limits.BvhcntMax;
477 case KM_CNT:
478 return Limits.KmcntMax;
479 case X_CNT:
480 return Limits.XcntMax;
481 default:
482 break;
483 }
484 return 0;
485 }
486
487 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
488 bool isPreheaderToFlush(MachineBasicBlock &MBB,
489 const WaitcntBrackets &ScoreBrackets);
490 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
491 bool run(MachineFunction &MF);
492
493 bool isForceEmitWaitcnt() const {
494 for (auto T : inst_counter_types())
495 if (ForceEmitWaitcnt[T])
496 return true;
497 return false;
498 }
499
500 void setForceEmitWaitcnt() {
501// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
502// For debug builds, get the debug counter info and adjust if need be
503#ifndef NDEBUG
504 if (DebugCounter::isCounterSet(ForceExpCounter) &&
505 DebugCounter::shouldExecute(ForceExpCounter)) {
506 ForceEmitWaitcnt[EXP_CNT] = true;
507 } else {
508 ForceEmitWaitcnt[EXP_CNT] = false;
509 }
510
511 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
512 DebugCounter::shouldExecute(ForceLgkmCounter)) {
513 ForceEmitWaitcnt[DS_CNT] = true;
514 ForceEmitWaitcnt[KM_CNT] = true;
515 } else {
516 ForceEmitWaitcnt[DS_CNT] = false;
517 ForceEmitWaitcnt[KM_CNT] = false;
518 }
519
520 if (DebugCounter::isCounterSet(ForceVMCounter) &&
521 DebugCounter::shouldExecute(ForceVMCounter)) {
522 ForceEmitWaitcnt[LOAD_CNT] = true;
523 ForceEmitWaitcnt[SAMPLE_CNT] = true;
524 ForceEmitWaitcnt[BVH_CNT] = true;
525 } else {
526 ForceEmitWaitcnt[LOAD_CNT] = false;
527 ForceEmitWaitcnt[SAMPLE_CNT] = false;
528 ForceEmitWaitcnt[BVH_CNT] = false;
529 }
530#endif // NDEBUG
531 }
532
533 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
534 // instruction.
535 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
536 switch (Inst.getOpcode()) {
537 case AMDGPU::GLOBAL_INV:
538 return VMEM_READ_ACCESS; // tracked using loadcnt
539 case AMDGPU::GLOBAL_WB:
540 case AMDGPU::GLOBAL_WBINV:
541 return VMEM_WRITE_ACCESS; // tracked using storecnt
542 default:
543 break;
544 }
545
546 // Maps VMEM access types to their corresponding WaitEventType.
547 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
548 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
549
551 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
552 // these should use VM_CNT.
553 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
554 return VMEM_ACCESS;
555 if (Inst.mayStore() &&
556 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
557 // FLAT and SCRATCH instructions may access scratch. Other VMEM
558 // instructions do not.
559 if (TII->mayAccessScratchThroughFlat(Inst))
560 return SCRATCH_WRITE_ACCESS;
561 return VMEM_WRITE_ACCESS;
562 }
563 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
564 return VMEM_READ_ACCESS;
565 return VmemReadMapping[getVmemType(Inst)];
566 }
567
568 bool hasXcnt() const { return ST->hasWaitXCnt(); }
569
570 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
571 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
572 bool isVmemAccess(const MachineInstr &MI) const;
573 bool generateWaitcntInstBefore(MachineInstr &MI,
574 WaitcntBrackets &ScoreBrackets,
575 MachineInstr *OldWaitcntInstr,
576 bool FlushVmCnt);
577 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
579 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
580 MachineInstr *OldWaitcntInstr);
581 void updateEventWaitcntAfter(MachineInstr &Inst,
582 WaitcntBrackets *ScoreBrackets);
583 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
584 MachineBasicBlock *Block) const;
585 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
586 WaitcntBrackets &ScoreBrackets);
587 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
588 WaitcntBrackets &ScoreBrackets);
589};
590
591// This objects maintains the current score brackets of each wait counter, and
592// a per-register scoreboard for each wait counter.
593//
594// We also maintain the latest score for every event type that can change the
595// waitcnt in order to know if there are multiple types of events within
596// the brackets. When multiple types of event happen in the bracket,
597// wait count may get decreased out of order, therefore we need to put in
598// "s_waitcnt 0" before use.
599class WaitcntBrackets {
public:
  WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}

  // True if counter \p T scores SGPRs: the subtarget's SMEM access counter,
  // or X_CNT which also tracks scalar accesses.
  bool isSmemCounter(InstCounterType T) const {
    return T == Context->SmemAccessCounter || T == X_CNT;
  }

  // Row of SgprScores used for \p T: row 0 for the SMEM access counter,
  // row 1 for X_CNT (see the SgprScores declaration).
  unsigned getSgprScoresIdx(InstCounterType T) const {
    assert(isSmemCounter(T) && "Invalid SMEM counter");
    return T == X_CNT ? 1 : 0;
  }

  // Lower bound of the score bracket for counter \p T.
  unsigned getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreLBs[T];
  }

  // Upper bound of the score bracket for counter \p T.
  unsigned getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreUBs[T];
  }

  // Width of the bracket for \p T; zero iff nothing is pending on it
  // (see the assertion in hasPendingEvent(InstCounterType)).
  unsigned getScoreRange(InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);
  }

  // Score of register slot \p GprNo for counter \p T. Slots below
  // NUM_ALL_VGPRS are VGPR-like; the rest index the SGPR table.
  unsigned getRegScore(int GprNo, InstCounterType T) const {
    if (GprNo < NUM_ALL_VGPRS)
      return VgprScores[T][GprNo];
    return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
  }
631
632 bool merge(const WaitcntBrackets &Other);
633
634 RegInterval getRegInterval(const MachineInstr *MI,
636 const SIRegisterInfo *TRI,
637 const MachineOperand &Op) const;
638
639 bool counterOutOfOrder(InstCounterType T) const;
640 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
641 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
642
643 void determineWait(InstCounterType T, RegInterval Interval,
644 AMDGPU::Waitcnt &Wait) const;
645 void determineWait(InstCounterType T, int RegNo,
646 AMDGPU::Waitcnt &Wait) const {
647 determineWait(T, {RegNo, RegNo + 1}, Wait);
648 }
649
650 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
651 void applyWaitcnt(InstCounterType T, unsigned Count);
652 void applyXcnt(const AMDGPU::Waitcnt &Wait);
653 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
654 const MachineRegisterInfo *MRI, WaitEventType E,
656
  // Bitmask of all currently outstanding wait events.
  unsigned hasPendingEvent() const { return PendingEvents; }
  // Nonzero if event \p E is outstanding.
  unsigned hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }
  // Bitmask of outstanding events associated with counter \p T.
  unsigned hasPendingEvent(InstCounterType T) const {
    unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
    assert((HasPending != 0) == (getScoreRange(T) != 0));
    return HasPending;
  }

  // True if more than one distinct event type is pending on counter \p T,
  // in which case the counter may decrement out of order (see class
  // comment) and conservative waits are required.
  bool hasMixedPendingEvents(InstCounterType T) const {
    unsigned Events = hasPendingEvent(T);
    // Return true if more than one bit is set in Events.
    return Events & (Events - 1);
  }
672
  // True if the last recorded FLAT memory operation is still inside either
  // bracket it was scored on (FLAT ops are tracked on both DS_CNT and
  // LOAD_CNT, see setPendingFlat()).
  bool hasPendingFlat() const {
    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
  }

  // Record the current upper bounds as the most recent FLAT operation.
  void setPendingFlat() {
    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
  }

  // True if the last recorded GDS operation is still inside the DS_CNT
  // bracket.
  bool hasPendingGDS() const {
    return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
  }

  // DS_CNT wait value that covers the last GDS operation, clamped to one
  // below the counter's maximum.
  unsigned getPendingGDSWait() const {
    return std::min(getScoreUB(DS_CNT) - LastGDS,
                    Context->getWaitCountMax(DS_CNT) - 1);
  }

  // Record the current DS_CNT upper bound as the most recent GDS operation.
  void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
695
696 // Return true if there might be pending writes to the vgpr-interval by VMEM
697 // instructions with types different from V.
698 bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
699 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
700 assert(RegNo < NUM_ALL_VGPRS);
701 if (VgprVmemTypes[RegNo] & ~(1 << V))
702 return true;
703 }
704 return false;
705 }
706
  // Forget all pending VMEM-type bits for the registers in \p Interval.
  void clearVgprVmemTypes(RegInterval Interval) {
    for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
      assert(RegNo < NUM_ALL_VGPRS);
      VgprVmemTypes[RegNo] = 0;
    }
  }

  // On function entry or return, conservatively assume the maximum number
  // of store events is outstanding on STORE_CNT.
  void setStateOnFunctionEntryOrReturn() {
    setScoreUB(STORE_CNT,
               getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
    PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
  }

  // Representative LDS DMA stores (one per unique AAInfo; see the
  // LDSDMAStores member).
  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
    return LDSDMAStores;
  }
723
724 bool hasPointSampleAccel(const MachineInstr &MI) const;
725 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
726 RegInterval Interval) const;
727
728 void print(raw_ostream &) const;
729 void dump() const { print(dbgs()); }
730
private:
  // Shift/rebase bookkeeping for combining two brackets' scores when
  // merging incoming block states (see merge/mergeScore).
  struct MergeInfo {
    unsigned OldLB;
    unsigned OtherLB;
    unsigned MyShift;
    unsigned OtherShift;
  };
  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);

  void setScoreLB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreUBs[T] = Val;

    if (T != EXP_CNT)
      return;

    // Keep the EXP_CNT bracket no wider than the largest wait the target
    // can express.
    if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
  }

  // Score a single register slot (degenerate one-element interval).
  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
    setScoreByInterval({GprNo, GprNo + 1}, T, Val);
  }
760
761 void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
762 unsigned Score);
763
764 void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
766 const MachineOperand &Op, InstCounterType CntTy,
767 unsigned Val);
768
  const SIInsertWaitcnts *Context;

  // Bracket bounds per counter; a score S is outstanding iff LB < S <= UB.
  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  // Bitmask of WaitEventType values currently in flight.
  unsigned PendingEvents = 0;
  // Remember the last flat memory operation.
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  // Remember the last GDS operation.
  unsigned LastGDS = 0;
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int VgprUB = -1;
  int SgprUB = -1;
  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
  // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
  // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
  // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
  // X_CNT score.
  unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
  // write to each vgpr.
  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
  // Store representative LDS DMA operations. The only useful info here is
  // alias info. One store is kept per unique AAInfo.
  SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
794};
795
796class SIInsertWaitcntsLegacy : public MachineFunctionPass {
797public:
798 static char ID;
799 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
800
801 bool runOnMachineFunction(MachineFunction &MF) override;
802
803 StringRef getPassName() const override {
804 return "SI insert wait instructions";
805 }
806
807 void getAnalysisUsage(AnalysisUsage &AU) const override {
808 AU.setPreservesCFG();
814 }
815};
816
817} // end anonymous namespace
818
819RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
821 const SIRegisterInfo *TRI,
822 const MachineOperand &Op) const {
823 if (!TRI->isInAllocatableClass(Op.getReg()))
824 return {-1, -1};
825
826 // A use via a PW operand does not need a waitcnt.
827 // A partial write is not a WAW.
828 assert(!Op.getSubReg() || !Op.isUndef());
829
830 RegInterval Result;
831
832 MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
833 unsigned RegIdx = TRI->getHWRegIndex(MCReg);
834 assert(isUInt<8>(RegIdx));
835
836 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
837 unsigned Size = TRI->getRegSizeInBits(*RC);
838
839 // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits
840 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
841 unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0);
842 assert(Reg < AGPR_OFFSET);
843 Result.first = Reg;
844 if (TRI->isAGPR(*MRI, Op.getReg()))
845 Result.first += AGPR_OFFSET;
846 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
847 assert(Size % 16 == 0);
848 Result.second = Result.first + (Size / 16);
849 } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
850 // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
851 // sources like SRC_PRIVATE_BASE.
852 Result.first = RegIdx + NUM_ALL_VGPRS;
853 Result.second = Result.first + divideCeil(Size, 32);
854 } else {
855 return {-1, -1};
856 }
857
858 return Result;
859}
860
861void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
862 InstCounterType CntTy,
863 unsigned Score) {
864 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
865 if (RegNo < NUM_ALL_VGPRS) {
866 VgprUB = std::max(VgprUB, RegNo);
867 VgprScores[CntTy][RegNo] = Score;
868 } else {
869 SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
870 SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
871 }
872 }
873}
874
875void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
876 const SIRegisterInfo *TRI,
878 const MachineOperand &Op,
879 InstCounterType CntTy, unsigned Score) {
880 RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
881 setScoreByInterval(Interval, CntTy, Score);
882}
883
884// Return true if the subtarget is one that enables Point Sample Acceleration
885// and the MachineInstr passed in is one to which it might be applied (the
886// hardware makes this decision based on several factors, but we can't determine
887// this at compile time, so we have to assume it might be applied if the
888// instruction supports it).
889bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
890 if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
891 return false;
892
893 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
894 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
896 return BaseInfo->PointSampleAccel;
897}
898
899// Return true if the subtarget enables Point Sample Acceleration, the supplied
900// MachineInstr is one to which it might be applied and the supplied interval is
901// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
902// (this is the type that a point sample accelerated instruction effectively
903// becomes)
904bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
905 const MachineInstr &MI, RegInterval Interval) const {
906 if (!hasPointSampleAccel(MI))
907 return false;
908
909 return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
910}
911
912void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
913 const SIRegisterInfo *TRI,
915 WaitEventType E, MachineInstr &Inst) {
916 InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
917
918 unsigned UB = getScoreUB(T);
919 unsigned CurrScore = UB + 1;
920 if (CurrScore == 0)
921 report_fatal_error("InsertWaitcnt score wraparound");
922 // PendingEvents and ScoreUB need to be update regardless if this event
923 // changes the score of a register or not.
924 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
925 PendingEvents |= 1 << E;
926 setScoreUB(T, CurrScore);
927
928 if (T == EXP_CNT) {
929 // Put score on the source vgprs. If this is a store, just use those
930 // specific register(s).
931 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
932 // All GDS operations must protect their address register (same as
933 // export.)
934 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
935 setScoreByOperand(&Inst, TRI, MRI, *AddrOp, EXP_CNT, CurrScore);
936
937 if (Inst.mayStore()) {
938 if (const auto *Data0 =
939 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
940 setScoreByOperand(&Inst, TRI, MRI, *Data0, EXP_CNT, CurrScore);
941 if (const auto *Data1 =
942 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
943 setScoreByOperand(&Inst, TRI, MRI, *Data1, EXP_CNT, CurrScore);
944 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
945 Inst.getOpcode() != AMDGPU::DS_APPEND &&
946 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
947 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
948 for (const MachineOperand &Op : Inst.all_uses()) {
949 if (TRI->isVectorRegister(*MRI, Op.getReg()))
950 setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
951 }
952 }
953 } else if (TII->isFLAT(Inst)) {
954 if (Inst.mayStore()) {
955 setScoreByOperand(&Inst, TRI, MRI,
956 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
957 EXP_CNT, CurrScore);
958 } else if (SIInstrInfo::isAtomicRet(Inst)) {
959 setScoreByOperand(&Inst, TRI, MRI,
960 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
961 EXP_CNT, CurrScore);
962 }
963 } else if (TII->isMIMG(Inst)) {
964 if (Inst.mayStore()) {
965 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
966 CurrScore);
967 } else if (SIInstrInfo::isAtomicRet(Inst)) {
968 setScoreByOperand(&Inst, TRI, MRI,
969 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
970 EXP_CNT, CurrScore);
971 }
972 } else if (TII->isMTBUF(Inst)) {
973 if (Inst.mayStore())
974 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
975 CurrScore);
976 } else if (TII->isMUBUF(Inst)) {
977 if (Inst.mayStore()) {
978 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
979 CurrScore);
980 } else if (SIInstrInfo::isAtomicRet(Inst)) {
981 setScoreByOperand(&Inst, TRI, MRI,
982 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
983 EXP_CNT, CurrScore);
984 }
985 } else if (TII->isLDSDIR(Inst)) {
986 // LDSDIR instructions attach the score to the destination.
987 setScoreByOperand(&Inst, TRI, MRI,
988 *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
989 EXP_CNT, CurrScore);
990 } else {
991 if (TII->isEXP(Inst)) {
992 // For export the destination registers are really temps that
993 // can be used as the actual source after export patching, so
994 // we need to treat them like sources and set the EXP_CNT
995 // score.
996 for (MachineOperand &DefMO : Inst.all_defs()) {
997 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
998 setScoreByOperand(&Inst, TRI, MRI, DefMO, EXP_CNT, CurrScore);
999 }
1000 }
1001 }
1002 for (const MachineOperand &Op : Inst.all_uses()) {
1003 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1004 setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
1005 }
1006 }
1007 } else if (T == X_CNT) {
1008 for (const MachineOperand &Op : Inst.all_uses())
1009 setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore);
1010 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1011 // Match the score to the destination registers.
1012 //
1013 // Check only explicit operands. Stores, especially spill stores, include
1014 // implicit uses and defs of their super registers which would create an
1015 // artificial dependency, while these are there only for register liveness
1016 // accounting purposes.
1017 //
1018 // Special cases where implicit register defs exists, such as M0 or VCC,
1019 // but none with memory instructions.
1020 for (const MachineOperand &Op : Inst.defs()) {
1021 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
1022 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1023 if (Interval.first >= NUM_ALL_VGPRS)
1024 continue;
1025 if (updateVMCntOnly(Inst)) {
1026 // updateVMCntOnly should only leave us with VGPRs
1027 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1028 // defs. That's required for a sane index into `VgprMemTypes` below
1029 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1030 VmemType V = getVmemType(Inst);
1031 unsigned char TypesMask = 1 << V;
1032 // If instruction can have Point Sample Accel applied, we have to flag
1033 // this with another potential dependency
1034 if (hasPointSampleAccel(Inst))
1035 TypesMask |= 1 << VMEM_NOSAMPLER;
1036 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
1037 VgprVmemTypes[RegNo] |= TypesMask;
1038 }
1039 }
1040 setScoreByInterval(Interval, T, CurrScore);
1041 }
1042 if (Inst.mayStore() &&
1043 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1044 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1045 // written can be accessed. A load from LDS to VMEM does not need a wait.
1046 unsigned Slot = 0;
1047 for (const auto *MemOp : Inst.memoperands()) {
1048 if (!MemOp->isStore() ||
1049 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1050 continue;
1051 // Comparing just AA info does not guarantee memoperands are equal
1052 // in general, but this is so for LDS DMA in practice.
1053 auto AAI = MemOp->getAAInfo();
1054 // Alias scope information gives a way to definitely identify an
1055 // original memory object and practically produced in the module LDS
1056 // lowering pass. If there is no scope available we will not be able
1057 // to disambiguate LDS aliasing as after the module lowering all LDS
1058 // is squashed into a single big object. Do not attempt to use one of
1059 // the limited LDSDMAStores for something we will not be able to use
1060 // anyway.
1061 if (!AAI || !AAI.Scope)
1062 break;
1063 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1064 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1065 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1066 Slot = I + 1;
1067 break;
1068 }
1069 }
1070 }
1071 if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
1072 break;
1073 LDSDMAStores.push_back(&Inst);
1074 Slot = LDSDMAStores.size();
1075 break;
1076 }
1077 setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
1078 if (Slot)
1079 setRegScore(FIRST_LDS_VGPR, T, CurrScore);
1080 }
1081 }
1082}
1083
// Dump the scoreboard state to \p OS for debugging: one line per counter
// showing its score range and the relative score of each tracked VGPR/SGPR,
// followed by the set of still-pending wait events.
void WaitcntBrackets::print(raw_ostream &OS) const {
  const GCNSubtarget *ST = Context->ST;

  OS << '\n';
  for (auto T : inst_counter_types(Context->MaxCounter)) {
    unsigned SR = getScoreRange(T);

    // Print the counter's name; pre-gfx12 targets use the legacy VM/LGKM/VS
    // names when extended wait counts are not available.
    switch (T) {
    case LOAD_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
         << SR << "): ";
      break;
    case DS_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
         << SR << "): ";
      break;
    case EXP_CNT:
      OS << " EXP_CNT(" << SR << "): ";
      break;
    case STORE_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
         << SR << "): ";
      break;
    case SAMPLE_CNT:
      OS << " SAMPLE_CNT(" << SR << "): ";
      break;
    case BVH_CNT:
      OS << " BVH_CNT(" << SR << "): ";
      break;
    case KM_CNT:
      OS << " KM_CNT(" << SR << "): ";
      break;
    case X_CNT:
      OS << " X_CNT(" << SR << "): ";
      break;
    default:
      OS << " UNKNOWN(" << SR << "): ";
      break;
    }

    if (SR != 0) {
      // Print vgpr scores.
      unsigned LB = getScoreLB(T);

      for (int J = 0; J <= VgprUB; J++) {
        unsigned RegScore = getRegScore(J, T);
        // Registers at or below the lower bound have no outstanding events.
        if (RegScore <= LB)
          continue;
        // Scores are printed relative to the lower bound of the bracket.
        unsigned RelScore = RegScore - LB - 1;
        if (J < FIRST_LDS_VGPR) {
          OS << RelScore << ":v" << J << " ";
        } else {
          // Pseudo-registers past FIRST_LDS_VGPR track LDS DMA stores.
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt or xcnt.
      if (isSmemCounter(T)) {
        for (int J = 0; J <= SgprUB; J++) {
          // SGPR scores are stored after all VGPR slots.
          unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }

  // List every wait event that is still outstanding, or "none".
  OS << "Pending Events: ";
  if (hasPendingEvent()) {
    ListSeparator LS;
    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
      if (hasPendingEvent((WaitEventType)I)) {
        OS << LS << WaitEventTypeName[I];
      }
    }
  } else {
    OS << "none";
  }
  OS << '\n';

  OS << '\n';
}
1168
/// Simplify the waitcnt, in the sense of removing redundant counts: each
/// component of \p Wait that is already satisfied by the scoreboard state is
/// reset to "no wait" (~0u) in place.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
  simplifyWaitcnt(DS_CNT, Wait.DsCnt);
  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
  simplifyWaitcnt(KM_CNT, Wait.KmCnt);
  simplifyWaitcnt(X_CNT, Wait.XCnt);
}
1181
1182void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1183 unsigned &Count) const {
1184 // The number of outstanding events for this type, T, can be calculated
1185 // as (UB - LB). If the current Count is greater than or equal to the number
1186 // of outstanding events, then the wait for this counter is redundant.
1187 if (Count >= getScoreRange(T))
1188 Count = ~0u;
1189}
1190
/// For each register in \p Interval, check whether its score on counter \p T
/// falls inside the open bracket (LB, UB]; if so, merge the count needed to
/// retire the corresponding event into \p Wait.
void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
                                    AMDGPU::Waitcnt &Wait) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    unsigned ScoreToWait = getRegScore(RegNo, T);

    // If the score of src_operand falls within the bracket, we need an
    // s_waitcnt instruction.
    if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
      if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
          !Context->ST->hasFlatLgkmVMemCountInOrder()) {
        // If there is a pending FLAT operation, and this is a VMem or LGKM
        // waitcnt and the target can report early completion, then we need
        // to force a waitcnt 0.
        addWait(Wait, T, 0);
      } else if (counterOutOfOrder(T)) {
        // Counter can get decremented out-of-order when there
        // are multiple types event in the bracket. Also emit an s_wait counter
        // with a conservative value of 0 for the counter.
        addWait(Wait, T, 0);
      } else {
        // If a counter has been maxed out avoid overflow by waiting for
        // MAX(CounterType) - 1 instead.
        unsigned NeededWait =
            std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
        addWait(Wait, T, NeededWait);
      }
    }
  }
}
1222
// Apply every component of \p Wait to the score brackets, advancing each
// counter's lower bound to reflect the events the wait retires.
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(DS_CNT, Wait.DsCnt);
  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
  applyWaitcnt(KM_CNT, Wait.KmCnt);
  // XCnt is applied last via applyXcnt, which inspects Wait.KmCnt/LoadCnt and
  // the pending-event state to exploit waits that already imply it.
  applyXcnt(Wait);
}
1233
1234void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1235 const unsigned UB = getScoreUB(T);
1236 if (Count >= UB)
1237 return;
1238 if (Count != 0) {
1239 if (counterOutOfOrder(T))
1240 return;
1241 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1242 } else {
1243 setScoreLB(T, UB);
1244 PendingEvents &= ~Context->WaitEventMaskForInst[T];
1245 }
1246}
1247
// Apply the XCnt component of \p Wait, exploiting waits on other counters
// that already imply completion of the transfer counter.
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
  // Wait on XCNT is redundant if we are already waiting for a load to complete.
  // SMEM can return out of order, so only omit XCNT wait if we are waiting till
  // zero.
  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
    return applyWaitcnt(X_CNT, 0);

  // If we have pending store we cannot optimize XCnt because we do not wait for
  // stores. VMEM loads return in order, so if we only have loads XCnt is
  // decremented to the same number as LOADCnt.
  if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(STORE_CNT))
    return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));

  // No piggy-backing possible: apply the requested XCnt wait directly.
  applyWaitcnt(X_CNT, Wait.XCnt);
}
1264
1265// Where there are multiple types of event in the bracket of a counter,
1266// the decrement may go out of order.
1267bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1268 // Scalar memory read always can go out of order.
1269 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1270 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1271 return true;
1272 return hasMixedPendingEvents(T);
1273}
1274
1275INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1276 false, false)
1279INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1281
1282char SIInsertWaitcntsLegacy::ID = 0;
1283
1284char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1285
1287 return new SIInsertWaitcntsLegacy();
1288}
1289
1290static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1291 unsigned NewEnc) {
1292 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1293 assert(OpIdx >= 0);
1294
1295 MachineOperand &MO = MI.getOperand(OpIdx);
1296
1297 if (NewEnc == MO.getImm())
1298 return false;
1299
1300 MO.setImm(NewEnc);
1301 return true;
1302}
1303
1304/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1305/// and if so, which counter it is waiting on.
1306static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1307 switch (Opcode) {
1308 case AMDGPU::S_WAIT_LOADCNT:
1309 return LOAD_CNT;
1310 case AMDGPU::S_WAIT_EXPCNT:
1311 return EXP_CNT;
1312 case AMDGPU::S_WAIT_STORECNT:
1313 return STORE_CNT;
1314 case AMDGPU::S_WAIT_SAMPLECNT:
1315 return SAMPLE_CNT;
1316 case AMDGPU::S_WAIT_BVHCNT:
1317 return BVH_CNT;
1318 case AMDGPU::S_WAIT_DSCNT:
1319 return DS_CNT;
1320 case AMDGPU::S_WAIT_KMCNT:
1321 return KM_CNT;
1322 case AMDGPU::S_WAIT_XCNT:
1323 return X_CNT;
1324 default:
1325 return {};
1326 }
1327}
1328
1329bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1330 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1331 if (Opcode == Waitcnt->getOpcode())
1332 return false;
1333
1334 Waitcnt->setDesc(TII->get(Opcode));
1335 return true;
1336}
1337
1338/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1339/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1340/// from \p Wait that were added by previous passes. Currently this pass
1341/// conservatively assumes that these preexisting waits are required for
1342/// correctness.
1343bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1344 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1346 assert(ST);
1347 assert(isNormalMode(MaxCounter));
1348
1349 bool Modified = false;
1350 MachineInstr *WaitcntInstr = nullptr;
1351 MachineInstr *WaitcntVsCntInstr = nullptr;
1352
1353 LLVM_DEBUG({
1354 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1355 if (It == OldWaitcntInstr.getParent()->instr_end())
1356 dbgs() << "end of block\n";
1357 else
1358 dbgs() << *It;
1359 });
1360
1361 for (auto &II :
1362 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1363 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1364 if (II.isMetaInstruction()) {
1365 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1366 continue;
1367 }
1368
1369 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1370 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1371
1372 // Update required wait count. If this is a soft waitcnt (= it was added
1373 // by an earlier pass), it may be entirely removed.
1374 if (Opcode == AMDGPU::S_WAITCNT) {
1375 unsigned IEnc = II.getOperand(0).getImm();
1376 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1377 if (TrySimplify)
1378 ScoreBrackets.simplifyWaitcnt(OldWait);
1379 Wait = Wait.combined(OldWait);
1380
1381 // Merge consecutive waitcnt of the same type by erasing multiples.
1382 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1383 II.eraseFromParent();
1384 Modified = true;
1385 } else
1386 WaitcntInstr = &II;
1387 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1388 assert(ST->hasVMemToLDSLoad());
1389 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1390 << "Before: " << Wait.LoadCnt << '\n';);
1391 ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
1392 LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
1393
1394 // It is possible (but unlikely) that this is the only wait instruction,
1395 // in which case, we exit this loop without a WaitcntInstr to consume
1396 // `Wait`. But that works because `Wait` was passed in by reference, and
1397 // the callee eventually calls createNewWaitcnt on it. We test this
1398 // possibility in an articial MIR test since such a situation cannot be
1399 // recreated by running the memory legalizer.
1400 II.eraseFromParent();
1401 } else {
1402 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1403 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1404
1405 unsigned OldVSCnt =
1406 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1407 if (TrySimplify)
1408 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1409 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1410
1411 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1412 II.eraseFromParent();
1413 Modified = true;
1414 } else
1415 WaitcntVsCntInstr = &II;
1416 }
1417 }
1418
1419 if (WaitcntInstr) {
1420 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1422 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1423
1424 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1425 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1426 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1427 Wait.LoadCnt = ~0u;
1428 Wait.ExpCnt = ~0u;
1429 Wait.DsCnt = ~0u;
1430
1431 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1432 ? dbgs()
1433 << "applied pre-existing waitcnt\n"
1434 << "New Instr at block end: " << *WaitcntInstr << '\n'
1435 : dbgs() << "applied pre-existing waitcnt\n"
1436 << "Old Instr: " << *It
1437 << "New Instr: " << *WaitcntInstr << '\n');
1438 }
1439
1440 if (WaitcntVsCntInstr) {
1441 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1442 AMDGPU::OpName::simm16, Wait.StoreCnt);
1443 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1444
1445 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1446 Wait.StoreCnt = ~0u;
1447
1448 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1449 ? dbgs() << "applied pre-existing waitcnt\n"
1450 << "New Instr at block end: " << *WaitcntVsCntInstr
1451 << '\n'
1452 : dbgs() << "applied pre-existing waitcnt\n"
1453 << "Old Instr: " << *It
1454 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1455 }
1456
1457 return Modified;
1458}
1459
1460/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1461/// required counters in \p Wait
1462bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1465 assert(ST);
1466 assert(isNormalMode(MaxCounter));
1467
1468 bool Modified = false;
1469 const DebugLoc &DL = Block.findDebugLoc(It);
1470
1471 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1472 // single instruction while VScnt has its own instruction.
1473 if (Wait.hasWaitExceptStoreCnt()) {
1474 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1475 [[maybe_unused]] auto SWaitInst =
1476 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1477 Modified = true;
1478
1479 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1480 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1481 dbgs() << "New Instr: " << *SWaitInst << '\n');
1482 }
1483
1484 if (Wait.hasWaitStoreCnt()) {
1485 assert(ST->hasVscnt());
1486
1487 [[maybe_unused]] auto SWaitInst =
1488 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1489 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1490 .addImm(Wait.StoreCnt);
1491 Modified = true;
1492
1493 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1494 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1495 dbgs() << "New Instr: " << *SWaitInst << '\n');
1496 }
1497
1498 return Modified;
1499}
1500
1502WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1503 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1504}
1505
1507WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1508 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1509 ~0u /* XCNT */);
1510}
1511
1512/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1513/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1514/// were added by previous passes. Currently this pass conservatively
1515/// assumes that these preexisting waits are required for correctness.
1516bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1517 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1519 assert(ST);
1520 assert(!isNormalMode(MaxCounter));
1521
1522 bool Modified = false;
1523 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1524 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1525 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1526
1527 LLVM_DEBUG({
1528 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1529 if (It == OldWaitcntInstr.getParent()->instr_end())
1530 dbgs() << "end of block\n";
1531 else
1532 dbgs() << *It;
1533 });
1534
1535 for (auto &II :
1536 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1537 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1538 if (II.isMetaInstruction()) {
1539 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1540 continue;
1541 }
1542
1543 MachineInstr **UpdatableInstr;
1544
1545 // Update required wait count. If this is a soft waitcnt (= it was added
1546 // by an earlier pass), it may be entirely removed.
1547
1548 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1549 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1550
1551 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1552 // attempt to do more than that either.
1553 if (Opcode == AMDGPU::S_WAITCNT)
1554 continue;
1555
1556 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1557 unsigned OldEnc =
1558 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1560 if (TrySimplify)
1561 ScoreBrackets.simplifyWaitcnt(OldWait);
1562 Wait = Wait.combined(OldWait);
1563 UpdatableInstr = &CombinedLoadDsCntInstr;
1564 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1565 unsigned OldEnc =
1566 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1568 if (TrySimplify)
1569 ScoreBrackets.simplifyWaitcnt(OldWait);
1570 Wait = Wait.combined(OldWait);
1571 UpdatableInstr = &CombinedStoreDsCntInstr;
1572 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1573 // Architectures higher than GFX10 do not have direct loads to
1574 // LDS, so no work required here yet.
1575 II.eraseFromParent();
1576 continue;
1577 } else {
1578 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1579 assert(CT.has_value());
1580 unsigned OldCnt =
1581 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1582 if (TrySimplify)
1583 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1584 addWait(Wait, CT.value(), OldCnt);
1585 UpdatableInstr = &WaitInstrs[CT.value()];
1586 }
1587
1588 // Merge consecutive waitcnt of the same type by erasing multiples.
1589 if (!*UpdatableInstr) {
1590 *UpdatableInstr = &II;
1591 } else {
1592 II.eraseFromParent();
1593 Modified = true;
1594 }
1595 }
1596
1597 if (CombinedLoadDsCntInstr) {
1598 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1599 // to be waited for. Otherwise, let the instruction be deleted so
1600 // the appropriate single counter wait instruction can be inserted
1601 // instead, when new S_WAIT_*CNT instructions are inserted by
1602 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1603 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1604 // the loop below that deals with single counter instructions.
1605 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1606 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1607 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1608 AMDGPU::OpName::simm16, NewEnc);
1609 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1610 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1611 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1612 Wait.LoadCnt = ~0u;
1613 Wait.DsCnt = ~0u;
1614
1615 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1616 ? dbgs() << "applied pre-existing waitcnt\n"
1617 << "New Instr at block end: "
1618 << *CombinedLoadDsCntInstr << '\n'
1619 : dbgs() << "applied pre-existing waitcnt\n"
1620 << "Old Instr: " << *It << "New Instr: "
1621 << *CombinedLoadDsCntInstr << '\n');
1622 } else {
1623 CombinedLoadDsCntInstr->eraseFromParent();
1624 Modified = true;
1625 }
1626 }
1627
1628 if (CombinedStoreDsCntInstr) {
1629 // Similarly for S_WAIT_STORECNT_DSCNT.
1630 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1631 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1632 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1633 AMDGPU::OpName::simm16, NewEnc);
1634 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1635 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1636 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1637 Wait.StoreCnt = ~0u;
1638 Wait.DsCnt = ~0u;
1639
1640 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1641 ? dbgs() << "applied pre-existing waitcnt\n"
1642 << "New Instr at block end: "
1643 << *CombinedStoreDsCntInstr << '\n'
1644 : dbgs() << "applied pre-existing waitcnt\n"
1645 << "Old Instr: " << *It << "New Instr: "
1646 << *CombinedStoreDsCntInstr << '\n');
1647 } else {
1648 CombinedStoreDsCntInstr->eraseFromParent();
1649 Modified = true;
1650 }
1651 }
1652
1653 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1654 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1655 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1656 // instructions so that createNewWaitcnt() will create new combined
1657 // instructions to replace them.
1658
1659 if (Wait.DsCnt != ~0u) {
1660 // This is a vector of addresses in WaitInstrs pointing to instructions
1661 // that should be removed if they are present.
1663
1664 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1665 // both) need to be waited for, ensure that there are no existing
1666 // individual wait count instructions for these.
1667
1668 if (Wait.LoadCnt != ~0u) {
1669 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1670 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1671 } else if (Wait.StoreCnt != ~0u) {
1672 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1673 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1674 }
1675
1676 for (MachineInstr **WI : WaitsToErase) {
1677 if (!*WI)
1678 continue;
1679
1680 (*WI)->eraseFromParent();
1681 *WI = nullptr;
1682 Modified = true;
1683 }
1684 }
1685
1686 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1687 if (!WaitInstrs[CT])
1688 continue;
1689
1690 unsigned NewCnt = getWait(Wait, CT);
1691 if (NewCnt != ~0u) {
1692 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1693 AMDGPU::OpName::simm16, NewCnt);
1694 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1695
1696 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1697 setNoWait(Wait, CT);
1698
1699 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1700 ? dbgs() << "applied pre-existing waitcnt\n"
1701 << "New Instr at block end: " << *WaitInstrs[CT]
1702 << '\n'
1703 : dbgs() << "applied pre-existing waitcnt\n"
1704 << "Old Instr: " << *It
1705 << "New Instr: " << *WaitInstrs[CT] << '\n');
1706 } else {
1707 WaitInstrs[CT]->eraseFromParent();
1708 Modified = true;
1709 }
1710 }
1711
1712 return Modified;
1713}
1714
1715/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1716bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1719 assert(ST);
1720 assert(!isNormalMode(MaxCounter));
1721
1722 bool Modified = false;
1723 const DebugLoc &DL = Block.findDebugLoc(It);
1724
1725 // Check for opportunities to use combined wait instructions.
1726 if (Wait.DsCnt != ~0u) {
1727 MachineInstr *SWaitInst = nullptr;
1728
1729 if (Wait.LoadCnt != ~0u) {
1730 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1731
1732 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1733 .addImm(Enc);
1734
1735 Wait.LoadCnt = ~0u;
1736 Wait.DsCnt = ~0u;
1737 } else if (Wait.StoreCnt != ~0u) {
1738 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1739
1740 SWaitInst =
1741 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1742 .addImm(Enc);
1743
1744 Wait.StoreCnt = ~0u;
1745 Wait.DsCnt = ~0u;
1746 }
1747
1748 if (SWaitInst) {
1749 Modified = true;
1750
1751 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1752 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1753 dbgs() << "New Instr: " << *SWaitInst << '\n');
1754 }
1755 }
1756
1757 // Generate an instruction for any remaining counter that needs
1758 // waiting for.
1759
1760 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1761 unsigned Count = getWait(Wait, CT);
1762 if (Count == ~0u)
1763 continue;
1764
1765 [[maybe_unused]] auto SWaitInst =
1766 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1767 .addImm(Count);
1768
1769 Modified = true;
1770
1771 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1772 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1773 dbgs() << "New Instr: " << *SWaitInst << '\n');
1774 }
1775
1776 return Modified;
1777}
1778
1779static bool readsVCCZ(const MachineInstr &MI) {
1780 unsigned Opc = MI.getOpcode();
1781 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1782 !MI.getOperand(1).isUndef();
1783}
1784
1785/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1787 // Currently all conventions wait, but this may not always be the case.
1788 //
1789 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1790 // senses to omit the wait and do it in the caller.
1791 return true;
1792}
1793
1794/// \returns true if the callee is expected to wait for any outstanding waits
1795/// before returning.
1796static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
1797
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
/// flush the vmcnt counter here.
///
/// \param MI instruction the generated wait (if any) must precede.
/// \param ScoreBrackets in/out counter state tracked for the current block.
/// \param OldWaitcntInstr first pre-existing waitcnt instruction before \p MI
///        that may be merged with or erased, or null.
/// \param FlushVmCnt when true, additionally force the vm counters to zero.
/// \returns true if any machine code was changed.
//
// NOTE(review): this listing lost a few physical lines during extraction
// (e.g. the local `AMDGPU::Waitcnt Wait;` declaration and the tails of two
// conditions below) — cross-check against upstream LLVM before editing.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                 WaitcntBrackets &ScoreBrackets,
                                                 MachineInstr *OldWaitcntInstr,
                                                 bool FlushVmCnt) {
  setForceEmitWaitcnt();

  // Meta instructions produce no machine code and never need a wait.
  assert(!MI.isMetaInstruction());

  // FIXME: This should have already been handled by the memory legalizer.
  // Removing this currently doesn't affect any lit tests, but we need to
  // verify that nothing was relying on this. The number of buffer invalidates
  // being handled here should not be expanded.
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
    Wait.LoadCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  // with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::SI_RETURN ||
      MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
  }
  // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
  // Technically the hardware will do this on its own if we don't, but that
  // might cost extra cycles compared to doing it explicitly.
  // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
  // have to wait for outstanding VMEM stores. In this case it can be useful to
  // send a message to explicitly release all VGPRs before the stores have
  // completed, but it is only safe to do this if there are no outstanding
  // scratch stores.
  else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
           MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
    if (!WCG->isOptNone() &&
        (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
          ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
          !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
      ReleaseVGPRInsts.insert(&MI);
  }
  // Resolve vm waits before gs-done.
  // NOTE(review): the closing half of this condition (the message-ID compare
  // value) was dropped from this listing by extraction.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ST->hasLegacyGeometry() &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
    Wait.LoadCnt = 0;
  }

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

    // Wait for any pending GDS instruction to complete before any
    // "Always GDS" instruction.
    if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
      addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());

    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
      // The function is going to insert a wait on everything in its prolog.
      // This still needs to be careful if the call target is a load (e.g. a GOT
      // load). We also need to check WAW dependency with saved PC.

      const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      if (CallAddrOp.isReg()) {
        RegInterval CallAddrOpInterval =
            ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOp);

        ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
                                    Wait);

        if (const auto *RtnAddrOp =
                TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
          RegInterval RtnAddrOpInterval =
              ScoreBrackets.getRegInterval(&MI, MRI, TRI, *RtnAddrOp);

          ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
                                      Wait);
        }
      }
    } else {
      // FIXME: Should not be relying on memoperands.
      // Look at the source operands of every instruction to see if
      // any of them results from a previous memory operation that affects
      // its current usage. If so, an s_waitcnt instruction needs to be
      // emitted.
      // If the source operand was defined by a load, add the s_waitcnt
      // instruction.
      //
      // Two cases are handled for destination operands:
      // 1) If the destination operand was defined by a load, add the s_waitcnt
      // instruction to guarantee the right WAW order.
      // 2) If a destination operand that was used by a recent export/store ins,
      // add s_waitcnt on exp_cnt to guarantee the WAR order.

      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        // A store to an address previously read by an SMEM load must wait for
        // that load (WAR through memory tracked via SLoadAddresses).
        if (Memop->isStore()) {
          if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
            addWait(Wait, SmemAccessCounter, 0);
            if (PDT->dominates(MI.getParent(), It->second))
              SLoadAddresses.erase(It);
          }
        }
        unsigned AS = Memop->getAddrSpace();
        // NOTE(review): the address-space guard for this `continue` was
        // dropped from this listing by extraction.
          continue;
        // No need to wait before load from VMEM to LDS.
        if (TII->mayWriteLDSThroughDMA(MI))
          continue;

        // LOAD_CNT is only relevant to vgpr or LDS.
        unsigned RegNo = FIRST_LDS_VGPR;
        // Only objects with alias scope info were added to LDSDMAScopes array.
        // In the absence of the scope info we will not be able to disambiguate
        // aliasing here. There is no need to try searching for a corresponding
        // store slot. This is conservatively correct because in that case we
        // will produce a wait using the first (general) LDS DMA wait slot which
        // will wait on all of them anyway.
        if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
            if (MI.mayAlias(AA, *LDSDMAStores[I], true))
              ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
          }
        } else {
          ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
        }
        if (Memop->isStore()) {
          ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
        }
      }

      // Loop over use and def operands.
      for (const MachineOperand &Op : MI.operands()) {
        if (!Op.isReg())
          continue;

        // If the instruction does not read tied source, skip the operand.
        if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
          continue;

        RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op);

        const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
        if (IsVGPR) {
          // Implicit VGPR defs and uses are never a part of the memory
          // instructions description and usually present to account for
          // super-register liveness.
          // TODO: Most of the other instructions also have implicit uses
          // for the liveness accounting only.
          if (Op.isImplicit() && MI.mayLoadOrStore())
            continue;

          // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
          // previous write and this write are the same type of VMEM
          // instruction, in which case they are (in some architectures)
          // guaranteed to write their results in order anyway.
          // Additionally check instructions where Point Sample Acceleration
          // might be applied.
          if (Op.isUse() || !updateVMCntOnly(MI) ||
              ScoreBrackets.hasOtherPendingVmemTypes(Interval,
                                                     getVmemType(MI)) ||
              ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
              !ST->hasVmemWriteVgprInOrder()) {
            ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
            ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);
            ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
            ScoreBrackets.clearVgprVmemTypes(Interval);
          }

          if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
            ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
          }
          ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
        } else {
          ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
        }

        if (hasXcnt() && Op.isDef())
          ScoreBrackets.determineWait(X_CNT, Interval, Wait);
      }
    }
  }

  // Ensure safety against exceptions from outstanding memory operations while
  // waiting for a barrier:
  //
  // * Some subtargets safely handle backing off the barrier in hardware
  //   when an exception occurs.
  // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
  //   there can be no outstanding memory operations during the wait.
  // * Subtargets with split barriers don't need to back off the barrier; it
  //   is up to the trap handler to preserve the user barrier state correctly.
  //
  // In all other cases, ensure safety by ensuring that there are no outstanding
  // memory operations.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  // after fixing the scheduler. Also, the Shader Compiler code is
  // independent of target.
  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
    if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
      Wait.DsCnt = 0;
    }
  }

  // Verify that the wait is actually needed.
  ScoreBrackets.simplifyWaitcnt(Wait);

  // When forcing emit, we need to skip terminators because that would break the
  // terminators of the MBB if we emit a waitcnt between terminators.
  if (ForceEmitZeroFlag && !MI.isTerminator())
    Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);

  // Debug-counter / flag driven forced waits (see ForceEmitWaitcnt setup).
  if (ForceEmitWaitcnt[LOAD_CNT])
    Wait.LoadCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[DS_CNT])
    Wait.DsCnt = 0;
  if (ForceEmitWaitcnt[SAMPLE_CNT])
    Wait.SampleCnt = 0;
  if (ForceEmitWaitcnt[BVH_CNT])
    Wait.BvhCnt = 0;
  if (ForceEmitWaitcnt[KM_CNT])
    Wait.KmCnt = 0;
  if (ForceEmitWaitcnt[X_CNT])
    Wait.XCnt = 0;

  // Loop-preheader flush: drain any pending vm counters (see isPreheaderToFlush).
  if (FlushVmCnt) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
      Wait.BvhCnt = 0;
  }

  if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
    Wait.LoadCnt = 0;

  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
                         OldWaitcntInstr);
}
2081
/// Emit (or fold into pre-existing waitcnt instructions) the counter waits
/// accumulated in \p Wait immediately before position \c It, updating
/// \p ScoreBrackets to reflect the applied wait.
/// \returns true if any instruction was added, removed or modified.
// NOTE(review): the iterator/block parameter lines of this signature and part
// of the VINTERP condition below were dropped from this listing by extraction
// — confirm against upstream LLVM.
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;

  if (OldWaitcntInstr)
    // Try to merge the required wait with preexisting waitcnt instructions.
    // Also erase redundant waitcnt.
    Modified =
        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);

  // Any counts that could have been applied to any existing waitcnt
  // instructions will have been done so, now deal with any remaining.
  ScoreBrackets.applyWaitcnt(Wait);

  // ExpCnt can be merged into VINTERP.
  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
    // Only tighten the instruction's own waitexp field, never loosen it.
    if (Wait.ExpCnt < WaitExp->getImm()) {
      WaitExp->setImm(Wait.ExpCnt);
      Modified = true;
    }
    Wait.ExpCnt = ~0u;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  // XCnt may be already consumed by a load wait.
  if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
      !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
    Wait.XCnt = ~0u;

  if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
      !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
    Wait.XCnt = ~0u;

  // Since the translation for VMEM addresses occur in-order, we can skip the
  // XCnt if the current instruction is of VMEM type and has a memory dependency
  // with another VMEM instruction in flight.
  if (Wait.XCnt != ~0u && isVmemAccess(*It))
    Wait.XCnt = ~0u;

  if (WCG->createNewWaitcnt(Block, It, Wait))
    Modified = true;

  return Modified;
}
2134
// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS. Other address spaces supported by flat memory operations involve
// global memory.
//
// \pre \p MI must be a FLAT instruction (asserted below).
// \returns true if MI may access VMEM; conservatively true when no memory
// operands are attached to the instruction.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // All flat instructions use the VMEM counter except prefetch.
  if (!TII->usesVM_CNT(MI))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access VMEM.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
  // (GDS) address space is not supported by flat operations. Therefore, simply
  // return true unless only the LDS address space is found.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    // NOTE(review): one physical line was dropped here by extraction —
    // confirm against upstream.
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }

  return false;
}
2164
// This is a flat memory operation. Check to see if it has memory tokens for
// either LDS or FLAT.
//
// \pre \p MI must be a FLAT instruction (asserted below).
// \returns true if MI may access LDS; conservatively true when no memory
// operands are attached to the instruction.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
  if (!TII->usesLGKM_CNT(MI))
    return false;

  // If in tgsplit mode then there can be no use of LDS.
  if (ST->isTgSplitEnabled())
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access LDS.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves LDS.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    // NOTE(review): the address-space test guarding this `return true` was
    // dropped from this listing by extraction — confirm against upstream.
      return true;
  }

  return false;
}
2192
2193bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2194 return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
2195 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2196}
2197
// \returns true if the instruction is one of the GFX12 global-memory cache
// invalidate / write-back opcodes (GLOBAL_INV, GLOBAL_WB, GLOBAL_WBINV).
// NOTE(review): the signature line of this static helper was dropped from the
// listing by extraction — confirm against upstream.
  auto Opc = Inst.getOpcode();
  return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
         Opc == AMDGPU::GLOBAL_WBINV;
}
2203
2204// Return true if the next instruction is S_ENDPGM, following fallthrough
2205// blocks if necessary.
2206bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2207 MachineBasicBlock *Block) const {
2208 auto BlockEnd = Block->getParent()->end();
2209 auto BlockIter = Block->getIterator();
2210
2211 while (true) {
2212 if (It.isEnd()) {
2213 if (++BlockIter != BlockEnd) {
2214 It = BlockIter->instr_begin();
2215 continue;
2216 }
2217
2218 return false;
2219 }
2220
2221 if (!It->isMetaInstruction())
2222 break;
2223
2224 It++;
2225 }
2226
2227 assert(!It.isEnd());
2228
2229 return It->getOpcode() == AMDGPU::S_ENDPGM;
2230}
2231
// Add a wait after an instruction if architecture requirements mandate one.
// NOTE(review): this listing lost the MachineBasicBlock parameter line, the
// local `Wait` declaration, and the tail of the getAllZeroWaitcnt() call
// during extraction — confirm against upstream LLVM before editing.
bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
                                             WaitcntBrackets &ScoreBrackets) {
  bool NeedsEndPGMCheck = false;

  // Precise-memory mode forces a full wait after every memory operation.
  if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
    Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&

  // "Always GDS" instructions force the DS counter to drain afterwards.
  if (TII->isAlwaysGDS(Inst.getOpcode())) {
    Wait.DsCnt = 0;
    NeedsEndPGMCheck = true;
  }

  ScoreBrackets.simplifyWaitcnt(Wait);

  // Place the wait after Inst, i.e. before its successor instruction.
  auto SuccessorIt = std::next(Inst.getIterator());
  bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
                                /*OldWaitcntInstr=*/nullptr);

  // Insert an S_NOP when a forced GDS wait lands directly before S_ENDPGM.
  if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
    BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
        .addImm(0);
  }

  return Result;
}
2261
// Record in \p ScoreBrackets the counter events produced by \p Inst. This is
// the bookkeeping counterpart of generateWaitcntInstBefore: it tracks what
// later instructions may have to wait on.
// NOTE(review): the tails of two conditions below (the second half of the
// isVMEM test and the ET_PARAM range test) were dropped from this listing by
// extraction — confirm against upstream LLVM.
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.

  bool IsVMEMAccess = false;
  bool IsSMEMAccess = false;
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
      ScoreBrackets->setPendingGDS();
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    // GFX12 cache invalidate/write-back only generates a VMEM event.
    if (isGFX12CacheInvOrWBInst(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                   Inst);
      return;
    }

    assert(Inst.mayLoadOrStore());

    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      IsVMEMAccess = true;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                   Inst);
    }

    if (mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }

    // This is a flat memory operation that access both VMEM and LDS, so note it
    // - it will require that both the VM and LGKM be flushed to zero if it is
    // pending when a VM or LGKM dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(Inst) &&
    IsVMEMAccess = true;
    ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                 Inst);

    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    IsSMEMAccess = true;
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(Inst)) {
      // Act as a wait on everything
      ScoreBrackets->applyWaitcnt(
          WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
      ScoreBrackets->setStateOnFunctionEntryOrReturn();
    } else {
      // May need to wait for anything.
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(Inst)) {
    // VINTERP carries its own embedded expcnt wait in the waitexp operand.
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
  } else if (SIInstrInfo::isEXP(Inst)) {
    // Classify the export by target: parameter, position, or other (GPR lock).
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
    case AMDGPU::S_BARRIER_LEAVE:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    }
  }

  // Group events below only exist on subtargets with the XCnt counter.
  if (!hasXcnt())
    return;

  if (IsVMEMAccess)
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst);

  if (IsSMEMAccess)
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst);
}
2373
2374bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2375 unsigned OtherScore) {
2376 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2377 unsigned OtherShifted =
2378 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2379 Score = std::max(MyShifted, OtherShifted);
2380 return OtherShifted > MyShifted;
2381}
2382
/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  VgprUB = std::max(VgprUB, Other.VgprUB);
  SgprUB = std::max(SgprUB, Other.SgprUB);

  for (auto T : inst_counter_types(Context->MaxCounter)) {
    // Merge event flags for this counter
    const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    // An event pending only in Other means the merged state is strictly
    // stronger than ours was.
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error("waitcnt score overflow");

    // Shift amounts rebase each side's scores onto the common new bracket
    // [ScoreLBs[T], NewUB]; mergeScore applies them per slot.
    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    if (T == DS_CNT)
      StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);

    for (int J = 0; J <= VgprUB; J++)
      StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);

    if (isSmemCounter(T)) {
      unsigned Idx = getSgprScoresIdx(T);
      for (int J = 0; J <= SgprUB; J++)
        StrictDom |=
            mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
    }
  }

  // Union the per-VGPR VMEM type masks; any newly set bit tightens waits.
  for (int J = 0; J <= VgprUB; J++) {
    unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
    StrictDom |= NewVmemTypes != VgprVmemTypes[J];
    VgprVmemTypes[J] = NewVmemTypes;
  }

  return StrictDom;
}
2442
2443static bool isWaitInstr(MachineInstr &Inst) {
2444 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2445 return Opcode == AMDGPU::S_WAITCNT ||
2446 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2447 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2448 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2449 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2450 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2451 counterTypeForInstr(Opcode).has_value();
2452}
2453
// Generate s_waitcnt instructions where needed.
// NOTE(review): this listing lost the MachineBasicBlock parameter line of the
// signature and the local `Wait` declaration near the end during extraction —
// confirm against upstream LLVM.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Begin Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block. There are two
  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
  // ST->partialVCCWritesUpdateVCCZ().
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;
    // Meta instructions emit no code and never need a wait.
    if (Inst.isMetaInstruction()) {
      ++Iter;
      continue;
    }

    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer.
    if (isWaitInstr(Inst)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    // Loop preheaders may want a full vm-counter flush at their terminator.
    bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
                      isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushVmCnt);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
          Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
        // operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        // of vcc back to vcc.
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          // Writes to vcc while there's an outstanding smem read may get
          // clobbered as soon as any read completes.
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(Inst)) {
      // Remember the addresses read by SMEM loads so that later vector stores
      // to the same address can wait on them (see generateWaitcntInstBefore).
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (RestoreVCCZ) {
      // Restore the vccz bit. Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
  // needed.
  if (Block.getFirstTerminator() == Block.end() &&
      isPreheaderToFlush(Block, ScoreBrackets)) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
      Wait.BvhCnt = 0;
  }

  // Combine or remove any redundant waitcnts at the end of the block.
  Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                              OldWaitcntInstr);

  LLVM_DEBUG({
    dbgs() << "*** End Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });

  return Modified;
}
2605
// Return true if the given machine basic block is a preheader of a loop in
// which we want to flush the vmcnt counter, and false otherwise.
// Results are memoized in PreheadersToFlush so each block is analyzed once.
// NOTE(review): the statement initializing `Succ` (apparently the single
// successor of MBB) was dropped from this listing by extraction — confirm
// against upstream.
bool SIInsertWaitcnts::isPreheaderToFlush(
    MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
  auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
  if (!IsInserted)
    return Iterator->second;

  if (!Succ)
    return false;

  MachineLoop *Loop = MLI->getLoopFor(Succ);
  if (!Loop)
    return false;

  // Only flush when MBB really is the preheader and the cost heuristic says
  // flushing pays off; record the positive answer in the cache.
  if (Loop->getLoopPreheader() == &MBB &&
      shouldFlushVmCnt(Loop, ScoreBrackets)) {
    Iterator->second = true;
    return true;
  }

  return false;
}
2630
// \returns true if \p MI is a VMEM instruction, or a FLAT instruction that may
// access VMEM.
// NOTE(review): the FLAT-instruction test guarding the first return was
// dropped from this listing by extraction — confirm against upstream.
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
    return mayAccessVMEMThroughFlat(MI);
  return SIInstrInfo::isVMEM(MI);
}
2636
// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        const WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  DenseSet<Register> VgprUse;
  DenseSet<Register> VgprDef;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        if (MI.mayLoad())
          HasVMemLoad = true;
        if (MI.mayStore())
          HasVMemStore = true;
      }
      for (const MachineOperand &Op : MI.all_uses()) {
        if (!TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
        // Vgpr use
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
          // If we find a register that is loaded inside the loop, 1. and 2.
          // are invalidated and we can exit.
          if (VgprDef.contains(RegNo))
            return false;
          VgprUse.insert(RegNo);
          // If at least one of Op's registers is in the score brackets, the
          // value is likely loaded outside of the loop.
          if (Brackets.getRegScore(RegNo, LOAD_CNT) >
                  Brackets.getScoreLB(LOAD_CNT) ||
              Brackets.getRegScore(RegNo, SAMPLE_CNT) >
                  Brackets.getScoreLB(SAMPLE_CNT) ||
              Brackets.getRegScore(RegNo, BVH_CNT) >
                  Brackets.getScoreLB(BVH_CNT)) {
            UsesVgprLoadedOutside = true;
            break;
          }
        }
      }

      // VMem load vgpr def
      if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
        for (const MachineOperand &Op : MI.all_defs()) {
          RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprUse.contains(RegNo))
              return false;
            VgprDef.insert(RegNo);
          }
        }
      }
    }
  }
  // Case 1: store-only loop with an input value still in flight from outside.
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  // Case 2: loop loads whose results were never observed as uses above,
  // provided VMEM writes complete in order on this target.
  return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
}
2705
2706bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2707 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2708 auto *PDT =
2709 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2710 AliasAnalysis *AA = nullptr;
2711 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2712 AA = &AAR->getAAResults();
2713
2714 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2715}
2716
// NOTE(review): new pass-manager entry point (SIInsertWaitcntsPass::run).
// Extraction appears to have dropped several original lines here (the
// function signature at 2717-2719, the head of the AA lookup at 2722, and
// the head of the return statement at 2729-2730) -- confirm against the
// upstream file before editing this span.
2720  auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
2721  auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
  // Fetches the cached function-level AAManager result (head of the
  // statement lost to extraction).
2723                 .getManager()
2724                 .getCachedResult<AAManager>(MF.getFunction());
2725
  // If the implementation made no changes, every analysis is preserved.
2726  if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2727    return PreservedAnalyses::all();
2728
2731      .preserve<AAManager>();
2732}
2733
// Run wait-count insertion on \p MF. Caches subtarget/instruction/register
// info, selects the wait-count generator for the target generation, seeds
// the entry-block state, then sweeps all blocks to a fixed point inserting
// waits, and finally handles scalar-store cache flushes and VGPR
// deallocation. Returns true if the function was modified.
2734bool SIInsertWaitcnts::run(MachineFunction &MF) {
2735  ST = &MF.getSubtarget<GCNSubtarget>();
2736  TII = ST->getInstrInfo();
2737  TRI = &TII->getRegisterInfo();
2738  MRI = &MF.getRegInfo();
  // NOTE(review): extraction dropped original lines 2739 and 2741 here
  // (presumably the initialization of IV and MFI, both used below) --
  // confirm against the upstream file.
2740
2742
  // Pick the wait-count generator matching the counter architecture:
  // GFX12+ uses the extended set of separate counters.
2743  if (ST->hasExtendedWaitCounts()) {
2744    MaxCounter = NUM_EXTENDED_INST_CNTS;
2745    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2746    WCG = &WCGGFX12Plus;
2747  } else {
2748    MaxCounter = NUM_NORMAL_INST_CNTS;
2749    WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2750    WCG = &WCGPreGFX12;
2751  }
2752
  // Reset per-run forced-wait flags.
2753  for (auto T : inst_counter_types())
2754    ForceEmitWaitcnt[T] = false;
2755
2756  WaitEventMaskForInst = WCG->getWaitEventMask();
2757
2758  SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2759
  // Record the maximum encodable value of each hardware counter for this
  // ISA version.
2760  if (ST->hasExtendedWaitCounts()) {
2761    Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2762    Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2763  } else {
2764    Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2765    Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2766  }
2767  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2768  Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2769  Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2770  Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2771  Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2772  Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
2773
  // Sanity-check that the register counts fit the tracking tables.
2774  [[maybe_unused]] unsigned NumVGPRsMax =
2775      ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
2776  [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2777  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2778  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2779
2780  BlockInfos.clear();
2781  bool Modified = false;
2782
  // NOTE(review): extraction dropped original line 2784 here (presumably the
  // declaration of the iterator I used by the skip loop below) -- confirm.
2783  MachineBasicBlock &EntryBB = MF.front();
2785
2786  if (!MFI->isEntryFunction()) {
2787    // Wait for any outstanding memory operations that the input registers may
2788    // depend on. We can't track them and it's better to do the wait after the
2789    // costly call sequence.
2790
2791    // TODO: Could insert earlier and schedule more liberally with operations
2792    // that only use caller preserved registers.
    // Skip past PHIs and meta instructions to find the insertion point.
2793    for (MachineBasicBlock::iterator E = EntryBB.end();
2794         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2795      ;
2796
2797    if (ST->hasExtendedWaitCounts()) {
      // GFX12+: zero out each relevant extended counter individually.
2798      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2799          .addImm(0);
2800      for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
        // LOAD/DS are covered by the combined wait above; STORE/X are
        // skipped entirely.
2801        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2802          continue;
2803
2804        if (!ST->hasImageInsts() &&
2805            (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2806          continue;
2807
2808        BuildMI(EntryBB, I, DebugLoc(),
2809                TII->get(instrsForExtendedCounterTypes[CT]))
2810            .addImm(0);
2811      }
2812    } else {
      // Pre-GFX12: a single s_waitcnt 0 covers the legacy counters.
2813      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2814    }
2815
    // Non-kernel functions start from a conservative bracket state.
2816    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
2817    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2818    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2819
2820    Modified = true;
2821  }
2822
2823  // Keep iterating over the blocks in reverse post order, inserting and
2824  // updating s_waitcnt where needed, until a fix point is reached.
  // NOTE(review): extraction dropped original line 2825 here (presumably the
  // loop header enumerating the blocks in reverse post order) -- confirm.
2826    BlockInfos.try_emplace(MBB);
2827
2828  std::unique_ptr<WaitcntBrackets> Brackets;
2829  bool Repeat;
2830  do {
2831    Repeat = false;
2832
2833    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2834         ++BII) {
2835      MachineBasicBlock *MBB = BII->first;
2836      BlockInfo &BI = BII->second;
      // Only revisit blocks whose incoming state changed.
2837      if (!BI.Dirty)
2838        continue;
2839
2840      if (BI.Incoming) {
2841        if (!Brackets)
2842          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2843        else
2844          *Brackets = *BI.Incoming;
2845      } else {
2846        if (!Brackets) {
2847          Brackets = std::make_unique<WaitcntBrackets>(this);
2848        } else {
2849          // Reinitialize in-place. N.B. do not do this by assigning from a
2850          // temporary because the WaitcntBrackets class is large and it could
2851          // cause this function to use an unreasonable amount of stack space.
2852          Brackets->~WaitcntBrackets();
2853          new (Brackets.get()) WaitcntBrackets(this);
2854        }
2855      }
2856
2857      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2858      BI.Dirty = false;
2859
      // Propagate outgoing state to successors; mark a successor dirty when
      // its state changes, and flag a repeat when that successor precedes
      // this block in iteration order (a backedge).
2860      if (Brackets->hasPendingEvent()) {
2861        BlockInfo *MoveBracketsToSucc = nullptr;
2862        for (MachineBasicBlock *Succ : MBB->successors()) {
2863          auto *SuccBII = BlockInfos.find(Succ);
2864          BlockInfo &SuccBI = SuccBII->second;
2865          if (!SuccBI.Incoming) {
2866            SuccBI.Dirty = true;
2867            if (SuccBII <= BII) {
2868              LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2869              Repeat = true;
2870            }
            // Move (rather than copy) the brackets into the first successor
            // that needs them; any further successors receive a copy.
2871            if (!MoveBracketsToSucc) {
2872              MoveBracketsToSucc = &SuccBI;
2873            } else {
2874              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2875            }
2876          } else if (SuccBI.Incoming->merge(*Brackets)) {
2877            SuccBI.Dirty = true;
2878            if (SuccBII <= BII) {
2879              LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2880              Repeat = true;
2881            }
2882          }
2883        }
2884        if (MoveBracketsToSucc)
2885          MoveBracketsToSucc->Incoming = std::move(Brackets);
2886      }
2887    }
2888  } while (Repeat);
2889
2890  if (ST->hasScalarStores()) {
    // NOTE(review): extraction dropped original line 2891 here (presumably
    // the declaration of EndPgmBlocks used below) -- confirm.
2892    bool HaveScalarStores = false;
2893
    // Find all wave-termination blocks and whether any scalar store exists.
2894    for (MachineBasicBlock &MBB : MF) {
2895      for (MachineInstr &MI : MBB) {
2896        if (!HaveScalarStores && TII->isScalarStore(MI))
2897          HaveScalarStores = true;
2898
2899        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2900            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2901          EndPgmBlocks.push_back(&MBB);
2902      }
2903    }
2904
2905    if (HaveScalarStores) {
2906      // If scalar writes are used, the cache must be flushed or else the next
2907      // wave to reuse the same scratch memory can be clobbered.
2908      //
2909      // Insert s_dcache_wb at wave termination points if there were any scalar
2910      // stores, and only if the cache hasn't already been flushed. This could
2911      // be improved by looking across blocks for flushes in postdominating
2912      // blocks from the stores but an explicitly requested flush is probably
2913      // very rare.
2914      for (MachineBasicBlock *MBB : EndPgmBlocks) {
2915        bool SeenDCacheWB = false;
2916
2917        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2918             I != E; ++I) {
          // A scalar store after a flush makes the cache dirty again.
2919          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2920            SeenDCacheWB = true;
2921          else if (TII->isScalarStore(*I))
2922            SeenDCacheWB = false;
2923
2924          // FIXME: It would be better to insert this before a waitcnt if any.
2925          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2926               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2927              !SeenDCacheWB) {
2928            Modified = true;
2929            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2930          }
2931        }
2932      }
2933    }
2934  }
2935
2936  // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2937  // This is done in different ways depending on how the VGPRs were allocated
2938  // (i.e. whether we're in dynamic VGPR mode or not).
2939  // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
2940  // waveslot limited kernel runs slower with the deallocation.
2941  if (MFI->isDynamicVGPREnabled()) {
2942    for (MachineInstr *MI : ReleaseVGPRInsts) {
2943      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2944              TII->get(AMDGPU::S_ALLOC_VGPR))
2945          .addImm(0);
2946      Modified = true;
2947    }
2948  } else {
    // NOTE(review): extraction dropped original lines 2954 and 2963 below
    // (the right-hand side of the occupancy comparison and the S_SENDMSG
    // immediate operand) -- confirm against the upstream file.
2949    if (!ReleaseVGPRInsts.empty() &&
2950        (MF.getFrameInfo().hasCalls() ||
2951         ST->getOccupancyWithNumVGPRs(
2952             TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
2953             /*IsDynamicVGPR=*/false) <
2955      for (MachineInstr *MI : ReleaseVGPRInsts) {
        // Some targets need an S_NOP hazard gap before the dealloc message.
2956        if (ST->requiresNopBeforeDeallocVGPRs()) {
2957          BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2958                  TII->get(AMDGPU::S_NOP))
2959              .addImm(0);
2960        }
2961        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2962                TII->get(AMDGPU::S_SENDMSG))
2964        Modified = true;
2965      }
2966    }
2967  }
  // Clear per-function scratch state before the next run.
2968  ReleaseVGPRInsts.clear();
2969  PreheadersToFlush.clear();
2970  SLoadAddresses.clear();
2971
2972  return Modified;
2973}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:194
uint64_t Size
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1328
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition: MD5.cpp:58
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
#define AMDGPU_EVENT_NAME(Name)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static bool readsVCCZ(const MachineInstr &MI)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define DEBUG_TYPE
#define AMDGPU_EVENT_ENUM(Name)
SI Insert Waitcnts
raw_pwrite_stream & OS
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition: blake3_impl.h:83
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
A private abstract base class describing the concept of an individual alias analysis implementation.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:412
Represent the analysis usage information of a pass.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:270
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:73
This class represents an Operation in the Expression.
static bool isCounterSet(unsigned ID)
Definition: DebugCounter.h:97
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:88
A debug info location.
Definition: DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:177
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:245
bool erase(const KeyT &Val)
Definition: DenseMap.h:319
iterator end()
Definition: DenseMap.h:87
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:230
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:314
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Instructions::iterator instr_iterator
instr_iterator instr_end()
iterator_range< succ_iterator > successors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:72
mop_range defs()
Returns all explicit operands that are register definitions.
Definition: MachineInstr.h:724
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:587
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:359
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:754
bool isCall(QueryType Type=AnyInBundle) const
Definition: MachineInstr.h:948
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:780
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:511
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
Definition: MachineInstr.h:764
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
Definition: MachineInstr.h:934
Analysis pass that exposes the MachineLoopInfo for a machine function.
A description of a memory reference used in the backend.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:67
iterator find(const KeyT &Key)
Definition: MapVector.h:141
iterator begin()
Definition: MapVector.h:65
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:107
void clear()
Definition: MapVector.h:84
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:85
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:151
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:668
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:701
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:743
static bool isLDSDIR(const MachineInstr &MI)
Definition: SIInstrInfo.h:891
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:602
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:660
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isAtomicRet(const MachineInstr &MI)
Definition: SIInstrInfo.h:725
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:1030
static bool isVINTERP(const MachineInstr &MI)
Definition: SIInstrInfo.h:899
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:644
static bool isAtomicNoRet(const MachineInstr &MI)
Definition: SIInstrInfo.h:717
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:862
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
LLVM Value Representation.
Definition: Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:169
self_iterator getIterator()
Definition: ilist_node.h:134
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI)
If Reg is a pseudo reg, return the correct hardware register given STI otherwise return Reg.
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
unsigned getStorecntBitMask(const IsaVersion &Version)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getXcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition: Sequence.h:337
@ Wait
Definition: Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ None
Definition: CodeGenData.h:107
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:82
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
FunctionPass * createSIInsertWaitcntsPass()
Instruction set architecture version.
Definition: TargetParser.h:132
Represents the counter values to wait for in an s_waitcnt instruction.
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
static constexpr bool is_iterable
Definition: Sequence.h:100