SIMemoryLegalizer.cpp
1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
18#include "GCNSubtarget.h"
27#include "llvm/IR/PassManager.h"
30
31using namespace llvm;
32using namespace llvm::AMDGPU;
33
34#define DEBUG_TYPE "si-memory-legalizer"
35#define PASS_NAME "SI Memory Legalizer"
36
38 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
39 cl::desc("Use this to skip inserting cache invalidating instructions."));
40
41namespace {
42
43LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
44
45/// Memory operation flags. Can be ORed together.
46enum class SIMemOp {
47 NONE = 0u,
48 LOAD = 1u << 0,
49 STORE = 1u << 1,
50 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
51};
52
53/// Position to insert a new instruction relative to an existing
54/// instruction.
55enum class Position {
56 BEFORE,
57 AFTER
58};
59
60/// The atomic synchronization scopes supported by the AMDGPU target.
61enum class SIAtomicScope {
62 NONE,
63 SINGLETHREAD,
64 WAVEFRONT,
65 WORKGROUP,
66 CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
67 AGENT,
68 SYSTEM
69};
70
71/// The distinct address spaces supported by the AMDGPU target for
72/// atomic memory operations. Can be ORed together.
73enum class SIAtomicAddrSpace {
74 NONE = 0u,
75 GLOBAL = 1u << 0,
76 LDS = 1u << 1,
77 SCRATCH = 1u << 2,
78 GDS = 1u << 3,
79 OTHER = 1u << 4,
80
81 /// The address spaces that can be accessed by a FLAT instruction.
82 FLAT = GLOBAL | LDS | SCRATCH,
83
84 /// The address spaces that support atomic instructions.
85 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
86
87 /// All address spaces.
88 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
89
90 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
91};
92
93class SIMemOpInfo final {
94private:
95
96 friend class SIMemOpAccess;
97
98 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
99 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
100 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
101 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
102 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
103 bool IsCrossAddressSpaceOrdering = false;
104 bool IsVolatile = false;
105 bool IsNonTemporal = false;
106 bool IsLastUse = false;
107 bool IsCooperative = false;
108
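  /// Constructs a SIMemOpInfo and normalizes it: drops the cross-address-space
  /// flag when only a single address space is both accessed and ordered, limits
  /// the scope to what the instruction's address spaces can require, and
  /// promotes CLUSTER to AGENT on subtargets without workgroup clusters.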
109 SIMemOpInfo(
110 const GCNSubtarget &ST,
111 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
112 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
113 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
114 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
115 bool IsCrossAddressSpaceOrdering = true,
116 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
117 bool IsVolatile = false, bool IsNonTemporal = false,
118 bool IsLastUse = false, bool IsCooperative = false)
119 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
120 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
121 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
122 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
123 IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
124
125 if (Ordering == AtomicOrdering::NotAtomic) {
126 assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
127 assert(Scope == SIAtomicScope::NONE &&
128 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
129 !IsCrossAddressSpaceOrdering &&
130 FailureOrdering == AtomicOrdering::NotAtomic);
131 return;
132 }
133
134 assert(Scope != SIAtomicScope::NONE &&
135 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
136 SIAtomicAddrSpace::NONE &&
137 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
138 SIAtomicAddrSpace::NONE);
139
140 // There is also no cross address space ordering if the ordering
141 // address space is the same as the instruction address space and
142 // only contains a single address space.
143 if ((OrderingAddrSpace == InstrAddrSpace) &&
144 isPowerOf2_32(uint32_t(InstrAddrSpace)))
145 this->IsCrossAddressSpaceOrdering = false;
146
147 // Limit the scope to the maximum supported by the instruction's address
148 // spaces.
149 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
150 SIAtomicAddrSpace::NONE) {
151 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
152 } else if ((InstrAddrSpace &
153 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
154 SIAtomicAddrSpace::NONE) {
155 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
156 } else if ((InstrAddrSpace &
157 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
158 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
159 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
160 }
161
162 // On targets that have no concept of a workgroup cluster, use
163 // AGENT scope as a conservatively correct alternative.
164 if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
165 this->Scope = SIAtomicScope::AGENT;
166 }
167
168public:
169 /// \returns Atomic synchronization scope of the machine instruction used to
170 /// create this SIMemOpInfo.
171 SIAtomicScope getScope() const {
172 return Scope;
173 }
174
175 /// \returns Ordering constraint of the machine instruction used to
176 /// create this SIMemOpInfo.
177 AtomicOrdering getOrdering() const {
178 return Ordering;
179 }
180
181 /// \returns Failure ordering constraint of the machine instruction used to
182 /// create this SIMemOpInfo.
183 AtomicOrdering getFailureOrdering() const {
184 return FailureOrdering;
185 }
186
187 /// \returns The address spaces accessed by the machine
188 /// instruction used to create this SIMemOpInfo.
189 SIAtomicAddrSpace getInstrAddrSpace() const {
190 return InstrAddrSpace;
191 }
192
193 /// \returns The address spaces that must be ordered by the machine
194 /// instruction used to create this SIMemOpInfo.
195 SIAtomicAddrSpace getOrderingAddrSpace() const {
196 return OrderingAddrSpace;
197 }
198
199 /// \returns True iff memory ordering of operations on
200 /// different address spaces is required.
201 bool getIsCrossAddressSpaceOrdering() const {
202 return IsCrossAddressSpaceOrdering;
203 }
204
205 /// \returns True if memory access of the machine instruction used to
206 /// create this SIMemOpInfo is volatile, false otherwise.
207 bool isVolatile() const {
208 return IsVolatile;
209 }
210
211 /// \returns True if memory access of the machine instruction used to
212 /// create this SIMemOpInfo is nontemporal, false otherwise.
213 bool isNonTemporal() const {
214 return IsNonTemporal;
215 }
216
217 /// \returns True if memory access of the machine instruction used to
218 /// create this SIMemOpInfo is last use, false otherwise.
219 bool isLastUse() const { return IsLastUse; }
220
221 /// \returns True if this is a cooperative load or store atomic.
222 bool isCooperative() const { return IsCooperative; }
223
224 /// \returns True if ordering constraint of the machine instruction used to
225 /// create this SIMemOpInfo is unordered or higher, false otherwise.
226 bool isAtomic() const {
227 return Ordering != AtomicOrdering::NotAtomic;
228 }
229
230};
231
232class SIMemOpAccess final {
233private:
234 const AMDGPUMachineModuleInfo *MMI = nullptr;
235 const GCNSubtarget &ST;
236
237 /// Reports unsupported message \p Msg for \p MI to LLVM context.
238 void reportUnsupported(const MachineBasicBlock::iterator &MI,
239 const char *Msg) const;
240
241 /// Inspects the target synchronization scope \p SSID and determines
242 /// the SI atomic scope it corresponds to, the address spaces it
243 /// covers, and whether the memory ordering applies between address
244 /// spaces.
245 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
246 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
247
248 /// \returns A bit set of the address spaces corresponding to \p AS.
249 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
250
251 /// \returns Info constructed from \p MI, which has at least one machine memory
252 /// operand.
253 std::optional<SIMemOpInfo>
254 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
255
256public:
257 /// Construct class to support accessing the machine memory operands
258 /// of instructions in the machine function \p MF.
259 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
260
261 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
262 std::optional<SIMemOpInfo>
263 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
264
265 /// \returns Store info if \p MI is a store operation, "std::nullopt"
266 /// otherwise.
267 std::optional<SIMemOpInfo>
268 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
269
270 /// \returns Atomic fence info if \p MI is an atomic fence operation,
271 /// "std::nullopt" otherwise.
272 std::optional<SIMemOpInfo>
273 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
274
275 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
276 /// rmw operation, "std::nullopt" otherwise.
277 std::optional<SIMemOpInfo>
278 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
279};
280
281class SICacheControl {
282protected:
283
284 /// AMDGPU subtarget info.
285 const GCNSubtarget &ST;
286
287 /// Instruction info.
288 const SIInstrInfo *TII = nullptr;
289
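  /// ISA version of the subtarget; used to encode S_WAITCNT operand fields.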
290 IsaVersion IV;
291
292 /// Whether to insert cache invalidating instructions.
293 bool InsertCacheInv;
294
295 SICacheControl(const GCNSubtarget &ST);
296
297 /// Sets named bit \p Bit to "true" if present in instruction \p MI.
298 /// \returns Returns true if \p MI is modified, false otherwise.
299 bool enableNamedBit(const MachineBasicBlock::iterator MI,
300 AMDGPU::CPol::CPol Bit) const;
301
302public:
303
304 /// Create a cache control for the subtarget \p ST.
305 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
306
307 /// Update \p MI memory load instruction to bypass any caches up to
308 /// the \p Scope memory scope for address spaces \p
309 /// AddrSpace. Return true iff the instruction was modified.
310 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
311 SIAtomicScope Scope,
312 SIAtomicAddrSpace AddrSpace) const = 0;
313
314 /// Update \p MI memory store instruction to bypass any caches up to
315 /// the \p Scope memory scope for address spaces \p
316 /// AddrSpace. Return true iff the instruction was modified.
317 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
318 SIAtomicScope Scope,
319 SIAtomicAddrSpace AddrSpace) const = 0;
320
321 /// Update \p MI memory read-modify-write instruction to bypass any caches up
322 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
323 /// iff the instruction was modified.
324 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
325 SIAtomicScope Scope,
326 SIAtomicAddrSpace AddrSpace) const = 0;
327
328 /// Update \p MI memory instruction of kind \p Op associated with address
329 /// spaces \p AddrSpace to indicate it is volatile and/or
330 /// nontemporal/last-use. Return true iff the instruction was modified.
331 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
332 SIAtomicAddrSpace AddrSpace,
333 SIMemOp Op, bool IsVolatile,
334 bool IsNonTemporal,
335 bool IsLastUse = false) const = 0;
336
337 virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
338 return false;
339 };
340
341 /// Handle cooperative load/store atomics.
342 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
344 "cooperative atomics are not available on this architecture");
345 }
346
347 /// Inserts any necessary instructions at position \p Pos relative
348 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
349 /// \p Op associated with address spaces \p AddrSpace have completed. Used
350 /// between memory instructions to enforce the order they become visible as
351 /// observed by other memory instructions executing in memory scope \p Scope.
352 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
353 /// address spaces. Returns true iff any instructions inserted.
354 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
355 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
356 bool IsCrossAddrSpaceOrdering, Position Pos,
357 AtomicOrdering Order) const = 0;
358
359 /// Inserts any necessary instructions at position \p Pos relative to
360 /// instruction \p MI to ensure any subsequent memory instructions of this
361 /// thread with address spaces \p AddrSpace will observe the previous memory
362 /// operations by any thread for memory scopes up to memory scope \p Scope.
363 /// Returns true iff any instructions inserted.
364 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
365 SIAtomicScope Scope,
366 SIAtomicAddrSpace AddrSpace,
367 Position Pos) const = 0;
368
369 /// Inserts any necessary instructions at position \p Pos relative to
370 /// instruction \p MI to ensure previous memory instructions by this thread
371 /// with address spaces \p AddrSpace have completed and can be observed by
372 /// subsequent memory instructions by any thread executing in memory scope \p
373 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
374 /// between address spaces. Returns true iff any instructions inserted.
375 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
376 SIAtomicScope Scope,
377 SIAtomicAddrSpace AddrSpace,
378 bool IsCrossAddrSpaceOrdering,
379 Position Pos) const = 0;
380
381 /// Inserts any necessary instructions before the barrier start instruction
382 /// \p MI in order to support pairing of barriers and fences.
383 virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
384 return false;
385 };
386
387 /// Virtual destructor to allow derivations to be deleted.
388 virtual ~SICacheControl() = default;
389};
390
391class SIGfx6CacheControl : public SICacheControl {
392protected:
393
394 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
395 /// is modified, false otherwise.
396 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
397 return enableNamedBit(MI, AMDGPU::CPol::GLC);
398 }
399
400 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
401 /// is modified, false otherwise.
402 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
403 return enableNamedBit(MI, AMDGPU::CPol::SLC);
404 }
405
406public:
407
408 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
409
410 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
411 SIAtomicScope Scope,
412 SIAtomicAddrSpace AddrSpace) const override;
413
414 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
415 SIAtomicScope Scope,
416 SIAtomicAddrSpace AddrSpace) const override;
417
418 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
419 SIAtomicScope Scope,
420 SIAtomicAddrSpace AddrSpace) const override;
421
422 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
423 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
424 bool IsVolatile, bool IsNonTemporal,
425 bool IsLastUse) const override;
426
427 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
428 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
429 bool IsCrossAddrSpaceOrdering, Position Pos,
430 AtomicOrdering Order) const override;
431
432 bool insertAcquire(MachineBasicBlock::iterator &MI,
433 SIAtomicScope Scope,
434 SIAtomicAddrSpace AddrSpace,
435 Position Pos) const override;
436
437 bool insertRelease(MachineBasicBlock::iterator &MI,
438 SIAtomicScope Scope,
439 SIAtomicAddrSpace AddrSpace,
440 bool IsCrossAddrSpaceOrdering,
441 Position Pos) const override;
442};
443
444class SIGfx7CacheControl : public SIGfx6CacheControl {
445public:
446
447 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
448
449 bool insertAcquire(MachineBasicBlock::iterator &MI,
450 SIAtomicScope Scope,
451 SIAtomicAddrSpace AddrSpace,
452 Position Pos) const override;
453
454};
455
456class SIGfx90ACacheControl : public SIGfx7CacheControl {
457public:
458
459 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
460
461 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
462 SIAtomicScope Scope,
463 SIAtomicAddrSpace AddrSpace) const override;
464
465 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
466 SIAtomicScope Scope,
467 SIAtomicAddrSpace AddrSpace) const override;
468
469 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
470 SIAtomicScope Scope,
471 SIAtomicAddrSpace AddrSpace) const override;
472
473 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
474 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
475 bool IsVolatile, bool IsNonTemporal,
476 bool IsLastUse) const override;
477
478 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
479 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
480 bool IsCrossAddrSpaceOrdering, Position Pos,
481 AtomicOrdering Order) const override;
482
483 bool insertAcquire(MachineBasicBlock::iterator &MI,
484 SIAtomicScope Scope,
485 SIAtomicAddrSpace AddrSpace,
486 Position Pos) const override;
487
488 bool insertRelease(MachineBasicBlock::iterator &MI,
489 SIAtomicScope Scope,
490 SIAtomicAddrSpace AddrSpace,
491 bool IsCrossAddrSpaceOrdering,
492 Position Pos) const override;
493};
494
495class SIGfx940CacheControl : public SIGfx90ACacheControl {
496protected:
497
498 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
499 /// is modified, false otherwise.
500 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
501 return enableNamedBit(MI, AMDGPU::CPol::SC0);
502 }
503
504 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
505 /// is modified, false otherwise.
506 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
507 return enableNamedBit(MI, AMDGPU::CPol::SC1);
508 }
509
510 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
511 /// is modified, false otherwise.
512 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
513 return enableNamedBit(MI, AMDGPU::CPol::NT);
514 }
515
516public:
517 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
518
519 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
520 SIAtomicScope Scope,
521 SIAtomicAddrSpace AddrSpace) const override;
522
523 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
524 SIAtomicScope Scope,
525 SIAtomicAddrSpace AddrSpace) const override;
526
527 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
528 SIAtomicScope Scope,
529 SIAtomicAddrSpace AddrSpace) const override;
530
531 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
532 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
533 bool IsVolatile, bool IsNonTemporal,
534 bool IsLastUse) const override;
535
536 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
537 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
538
539 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
540 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
541 Position Pos) const override;
542};
543
544class SIGfx10CacheControl : public SIGfx7CacheControl {
545protected:
546
547 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
548 /// is modified, false otherwise.
549 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
550 return enableNamedBit(MI, AMDGPU::CPol::DLC);
551 }
552
553public:
554
555 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
556
557 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
558 SIAtomicScope Scope,
559 SIAtomicAddrSpace AddrSpace) const override;
560
561 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
562 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
563 bool IsVolatile, bool IsNonTemporal,
564 bool IsLastUse) const override;
565
566 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
567 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
568 bool IsCrossAddrSpaceOrdering, Position Pos,
569 AtomicOrdering Order) const override;
570
571 bool insertAcquire(MachineBasicBlock::iterator &MI,
572 SIAtomicScope Scope,
573 SIAtomicAddrSpace AddrSpace,
574 Position Pos) const override;
575
576 bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
577};
578
579class SIGfx11CacheControl : public SIGfx10CacheControl {
580public:
581 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
582
583 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
584 SIAtomicScope Scope,
585 SIAtomicAddrSpace AddrSpace) const override;
586
587 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
588 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
589 bool IsVolatile, bool IsNonTemporal,
590 bool IsLastUse) const override;
591};
592
593class SIGfx12CacheControl : public SIGfx11CacheControl {
594protected:
595 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
596 // \returns Returns true if \p MI is modified, false otherwise.
597 bool setTH(const MachineBasicBlock::iterator MI,
598 AMDGPU::CPol::CPol Value) const;
599 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
600 // MI. \returns Returns true if \p MI is modified, false otherwise.
601 bool setScope(const MachineBasicBlock::iterator MI,
602 AMDGPU::CPol::CPol Value) const;
603
604 // Stores with system scope (SCOPE_SYS) need to wait for:
605 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
606 // - non-returning-atomics - wait for STORECNT==0
607 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
608 // since it does not distinguish atomics-with-return from regular stores.
609 // There is no need to wait if memory is cached (mtype != UC).
610 bool
611 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
612
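  // Sets the cache policy scope on \p MI's CPol operand to match \p Scope for
  // the address spaces in \p AddrSpace. Used by the cache-bypass hooks below.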
613 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
614 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
615
616public:
617 SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
618 // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
619 // the behavior is the same if assuming GFX12.0 in CU mode.
620 assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
621 }
622
623 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
624 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
625 bool IsCrossAddrSpaceOrdering, Position Pos,
626 AtomicOrdering Order) const override;
627
628 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
629 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
630
631 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
632 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
633 bool IsVolatile, bool IsNonTemporal,
634 bool IsLastUse) const override;
635
636 bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
637
638 virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
639
640 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
641 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
642 Position Pos) const override;
643
644 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
645 SIAtomicScope Scope,
646 SIAtomicAddrSpace AddrSpace) const override {
647 return setAtomicScope(MI, Scope, AddrSpace);
648 }
649
650 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
651 SIAtomicScope Scope,
652 SIAtomicAddrSpace AddrSpace) const override {
653 return setAtomicScope(MI, Scope, AddrSpace);
654 }
655
656 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
657 SIAtomicScope Scope,
658 SIAtomicAddrSpace AddrSpace) const override {
659 return setAtomicScope(MI, Scope, AddrSpace);
660 }
661};
662
663class SIMemoryLegalizer final {
664private:
665 const MachineModuleInfo &MMI;
666 /// Cache Control.
667 std::unique_ptr<SICacheControl> CC = nullptr;
668
669 /// List of atomic pseudo instructions.
670 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
671
672 /// Return true iff instruction \p MI is an atomic instruction that
673 /// returns a result.
674 bool isAtomicRet(const MachineInstr &MI) const {
675 return SIInstrInfo::isAtomicRet(MI);
676 }
677
678 /// Removes all processed atomic pseudo instructions from the current
679 /// function. Returns true if current function is modified, false otherwise.
680 bool removeAtomicPseudoMIs();
681
682 /// Expands load operation \p MI. Returns true if instructions are
683 /// added/deleted or \p MI is modified, false otherwise.
684 bool expandLoad(const SIMemOpInfo &MOI,
685 MachineBasicBlock::iterator &MI);
686 /// Expands store operation \p MI. Returns true if instructions are
687 /// added/deleted or \p MI is modified, false otherwise.
688 bool expandStore(const SIMemOpInfo &MOI,
689 MachineBasicBlock::iterator &MI);
690 /// Expands atomic fence operation \p MI. Returns true if
691 /// instructions are added/deleted or \p MI is modified, false otherwise.
692 bool expandAtomicFence(const SIMemOpInfo &MOI,
693 MachineBasicBlock::iterator &MI);
694 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
695 /// instructions are added/deleted or \p MI is modified, false otherwise.
696 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
697 MachineBasicBlock::iterator &MI);
698
699public:
700 SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
701 bool run(MachineFunction &MF);
702};
703
704class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
705public:
706 static char ID;
707
708 SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
709
710 void getAnalysisUsage(AnalysisUsage &AU) const override {
711 AU.setPreservesCFG();
712 MachineFunctionPass::getAnalysisUsage(AU);
713 }
714
715 StringRef getPassName() const override {
716 return PASS_NAME;
717 }
718
719 bool runOnMachineFunction(MachineFunction &MF) override;
720};
721
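/// Address space names accepted in "amdgpu-synchronize-as" MMRA annotations,
/// mapped to the SI atomic address spaces they denote.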
722static const StringMap<SIAtomicAddrSpace> ASNames = {{
723 {"global", SIAtomicAddrSpace::GLOBAL},
724 {"local", SIAtomicAddrSpace::LDS},
725}};
726
727void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
728 const MachineFunction *MF = MI.getMF();
729 const Function &Fn = MF->getFunction();
730 SmallString<128> Str;
731 raw_svector_ostream OS(Str);
732 OS << "unknown address space '" << AS << "'; expected one of ";
733 ListSeparator LS;
734 for (const auto &[Name, Val] : ASNames)
735 OS << LS << '\'' << Name << '\'';
736 Fn.getContext().diagnose(
737 DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
738}
739
740/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
741/// If this tag isn't present, or if it has no meaningful values, returns
742/// std::nullopt; otherwise returns the address spaces specified by the metadata.
743static std::optional<SIAtomicAddrSpace>
744getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
745 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
746
747 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
748 if (!MMRA)
749 return std::nullopt;
750
751 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
752 for (const auto &[Prefix, Suffix] : MMRA) {
753 if (Prefix != FenceASPrefix)
754 continue;
755
756 if (auto It = ASNames.find(Suffix); It != ASNames.end())
757 Result |= It->second;
758 else
759 diagnoseUnknownMMRAASName(MI, Suffix);
760 }
761
762 if (Result == SIAtomicAddrSpace::NONE)
763 return std::nullopt;
764
765 return Result;
766}
767
768} // end anonymous namespace
769
770void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
771 const char *Msg) const {
772 const Function &Func = MI->getParent()->getParent()->getFunction();
773 Func.getContext().diagnose(
774 DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
775}
776
777std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
778SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
779 SIAtomicAddrSpace InstrAddrSpace) const {
780 if (SSID == SyncScope::System)
781 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
782 if (SSID == MMI->getAgentSSID())
783 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
784 if (SSID == MMI->getClusterSSID())
785 return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
786 if (SSID == MMI->getWorkgroupSSID())
787 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
788 true);
789 if (SSID == MMI->getWavefrontSSID())
790 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
791 true);
792 if (SSID == SyncScope::SingleThread)
793 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
794 true);
795 if (SSID == MMI->getSystemOneAddressSpaceSSID())
796 return std::tuple(SIAtomicScope::SYSTEM,
797 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
798 if (SSID == MMI->getAgentOneAddressSpaceSSID())
799 return std::tuple(SIAtomicScope::AGENT,
800 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
801 if (SSID == MMI->getClusterOneAddressSpaceSSID())
802 return std::tuple(SIAtomicScope::CLUSTER,
803 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
804 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
805 return std::tuple(SIAtomicScope::WORKGROUP,
806 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
807 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
808 return std::tuple(SIAtomicScope::WAVEFRONT,
809 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
810 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
811 return std::tuple(SIAtomicScope::SINGLETHREAD,
812 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
813 return std::nullopt;
814}
815
816SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
817 if (AS == AMDGPUAS::FLAT_ADDRESS)
818 return SIAtomicAddrSpace::FLAT;
819 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
820 return SIAtomicAddrSpace::GLOBAL;
821 if (AS == AMDGPUAS::LOCAL_ADDRESS)
822 return SIAtomicAddrSpace::LDS;
823 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
824 return SIAtomicAddrSpace::SCRATCH;
825 if (AS == AMDGPUAS::REGION_ADDRESS)
826 return SIAtomicAddrSpace::GDS;
827
828 return SIAtomicAddrSpace::OTHER;
829}
830
831SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
832 const GCNSubtarget &ST)
833 : MMI(&MMI_), ST(ST) {}
834
835std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
836 const MachineBasicBlock::iterator &MI) const {
837 assert(MI->getNumMemOperands() > 0);
838
839 SyncScope::ID SSID = SyncScope::SingleThread;
840 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
841 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
842 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
843 bool IsNonTemporal = true;
844 bool IsVolatile = false;
845 bool IsLastUse = false;
846 bool IsCooperative = false;
847
848 // Validator should check whether or not MMOs cover the entire set of
849 // locations accessed by the memory instruction.
850 for (const auto &MMO : MI->memoperands()) {
851 IsNonTemporal &= MMO->isNonTemporal();
852 IsVolatile |= MMO->isVolatile();
853 IsLastUse |= MMO->getFlags() & MOLastUse;
854 IsCooperative |= MMO->getFlags() & MOCooperative;
855 InstrAddrSpace |=
856 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
857 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
858 if (OpOrdering != AtomicOrdering::NotAtomic) {
859 const auto &IsSyncScopeInclusion =
860 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
861 if (!IsSyncScopeInclusion) {
862 reportUnsupported(MI,
863 "Unsupported non-inclusive atomic synchronization scope");
864 return std::nullopt;
865 }
866
867 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
868 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
869 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
870 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
871 FailureOrdering =
872 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
873 }
874 }
875
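  // For atomic accesses, derive the SI scope and the set of ordered address
  // spaces from the sync scope ID; non-atomic accesses keep the NONE defaults.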
876 SIAtomicScope Scope = SIAtomicScope::NONE;
877 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
878 bool IsCrossAddressSpaceOrdering = false;
879 if (Ordering != AtomicOrdering::NotAtomic) {
880 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
881 if (!ScopeOrNone) {
882 reportUnsupported(MI, "Unsupported atomic synchronization scope");
883 return std::nullopt;
884 }
885 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
886 *ScopeOrNone;
887 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
888 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
889 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
890 reportUnsupported(MI, "Unsupported atomic address space");
891 return std::nullopt;
892 }
893 }
894 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
895 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
896 IsNonTemporal, IsLastUse, IsCooperative);
897}
898
899std::optional<SIMemOpInfo>
900SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
901 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
902
903 if (!(MI->mayLoad() && !MI->mayStore()))
904 return std::nullopt;
905
906 // Be conservative if there are no memory operands.
907 if (MI->getNumMemOperands() == 0)
908 return SIMemOpInfo(ST);
909
910 return constructFromMIWithMMO(MI);
911}
912
913std::optional<SIMemOpInfo>
914SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
915 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
916
917 if (!(!MI->mayLoad() && MI->mayStore()))
918 return std::nullopt;
919
920 // Be conservative if there are no memory operands.
921 if (MI->getNumMemOperands() == 0)
922 return SIMemOpInfo(ST);
923
924 return constructFromMIWithMMO(MI);
925}
926
927std::optional<SIMemOpInfo>
928SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
929 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
930
931 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
932 return std::nullopt;
933
934 AtomicOrdering Ordering =
935 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
936
937 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
938 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
939 if (!ScopeOrNone) {
940 reportUnsupported(MI, "Unsupported atomic synchronization scope");
941 return std::nullopt;
942 }
943
944 SIAtomicScope Scope = SIAtomicScope::NONE;
945 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
946 bool IsCrossAddressSpaceOrdering = false;
947 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
948 *ScopeOrNone;
949
950 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
951 // We currently expect refineOrderingAS to be the only place that
952 // can refine the AS ordered by the fence.
953 // If that changes, we need to review the semantics of that function
954 // in case it needs to preserve certain address spaces.
955 reportUnsupported(MI, "Unsupported atomic address space");
956 return std::nullopt;
957 }
958
959 auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
960 if (SynchronizeAS)
961 OrderingAddrSpace = *SynchronizeAS;
962
963 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
964 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
965 AtomicOrdering::NotAtomic);
966}
967
968std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
969 const MachineBasicBlock::iterator &MI) const {
970 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
971
972 if (!(MI->mayLoad() && MI->mayStore()))
973 return std::nullopt;
974
975 // Be conservative if there are no memory operands.
976 if (MI->getNumMemOperands() == 0)
977 return SIMemOpInfo(ST);
978
979 return constructFromMIWithMMO(MI);
980}
981
982SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
983 TII = ST.getInstrInfo();
984 IV = getIsaVersion(ST.getCPU());
985 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
986}
987
988bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
989 AMDGPU::CPol::CPol Bit) const {
990 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
991 if (!CPol)
992 return false;
993
994 CPol->setImm(CPol->getImm() | Bit);
995 return true;
996}
997
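/// Selects the most specific cache-control implementation for the subtarget,
/// checking feature sets (GFX940, GFX90A) before the generation-based defaults.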
998/* static */
999std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
1000 GCNSubtarget::Generation Generation = ST.getGeneration();
1001 if (ST.hasGFX940Insts())
1002 return std::make_unique<SIGfx940CacheControl>(ST);
1003 if (ST.hasGFX90AInsts())
1004 return std::make_unique<SIGfx90ACacheControl>(ST);
1005 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1006 return std::make_unique<SIGfx6CacheControl>(ST);
1007 if (Generation < AMDGPUSubtarget::GFX10)
1008 return std::make_unique<SIGfx7CacheControl>(ST);
1009 if (Generation < AMDGPUSubtarget::GFX11)
1010 return std::make_unique<SIGfx10CacheControl>(ST);
1011 if (Generation < AMDGPUSubtarget::GFX12)
1012 return std::make_unique<SIGfx11CacheControl>(ST);
1013 return std::make_unique<SIGfx12CacheControl>(ST);
1014}
1015
1016bool SIGfx6CacheControl::enableLoadCacheBypass(
1017 const MachineBasicBlock::iterator &MI,
1018 SIAtomicScope Scope,
1019 SIAtomicAddrSpace AddrSpace) const {
1020 assert(MI->mayLoad() && !MI->mayStore());
1021 bool Changed = false;
1022
1023 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1024 switch (Scope) {
1025 case SIAtomicScope::SYSTEM:
1026 case SIAtomicScope::AGENT:
1027 // Set L1 cache policy to MISS_EVICT.
1028 // Note: there is no L2 cache bypass policy at the ISA level.
1029 Changed |= enableGLCBit(MI);
1030 break;
1031 case SIAtomicScope::WORKGROUP:
1032 case SIAtomicScope::WAVEFRONT:
1033 case SIAtomicScope::SINGLETHREAD:
1034 // No cache to bypass.
1035 break;
1036 default:
1037 llvm_unreachable("Unsupported synchronization scope");
1038 }
1039 }
1040
1041 /// The scratch address space does not need the global memory caches
1042 /// to be bypassed as all memory operations by the same thread are
1043 /// sequentially consistent, and no other thread can access scratch
1044 /// memory.
1045
1046 /// Other address spaces do not have a cache.
1047
1048 return Changed;
1049}
1050
1051bool SIGfx6CacheControl::enableStoreCacheBypass(
1052 const MachineBasicBlock::iterator &MI,
1053 SIAtomicScope Scope,
1054 SIAtomicAddrSpace AddrSpace) const {
1055 assert(!MI->mayLoad() && MI->mayStore());
1056 bool Changed = false;
1057
1058 /// The L1 cache is write-through so does not need to be bypassed. There is no
1059 /// bypass control for the L2 cache at the ISA level.
1060
1061 return Changed;
1062}
1063
1064bool SIGfx6CacheControl::enableRMWCacheBypass(
1065 const MachineBasicBlock::iterator &MI,
1066 SIAtomicScope Scope,
1067 SIAtomicAddrSpace AddrSpace) const {
1068 assert(MI->mayLoad() && MI->mayStore());
1069 bool Changed = false;
1070
1071 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
1072 /// bypassed, and the GLC bit is instead used to indicate if they are
1073 /// return or no-return.
1074 /// Note: there is no L2 cache coherent bypass control at the ISA level.
1075
1076 return Changed;
1077}
1078
1079bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1080 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1081 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1082 // Only handle load and store, not atomic read-modify-write instructions. The
1083 // latter use glc to indicate if the atomic returns a result and so must not
1084 // be used for cache control.
1085 assert(MI->mayLoad() ^ MI->mayStore());
1086
1087 // Only update load and store, not LLVM IR atomic read-modify-write
1088 // instructions. The latter are always marked as volatile, so they cannot be
1089 // handled sensibly here without pessimizing all atomics. They also do not
1090 // support the nontemporal attribute.
1091 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1092
1093 bool Changed = false;
1094
1095 if (IsVolatile) {
1096 // Set L1 cache policy to be MISS_EVICT for load instructions
1097 // and MISS_LRU for store instructions.
1098 // Note: there is no L2 cache bypass policy at the ISA level.
1099 if (Op == SIMemOp::LOAD)
1100 Changed |= enableGLCBit(MI);
1101
1102 // Ensure operation has completed at system scope to cause all volatile
1103 // operations to be visible outside the program in a global order. Do not
1104 // request cross address space as only the global address space can be
1105 // observable outside the program, so no need to cause a waitcnt for LDS
1106 // address space operations.
1107 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1108 Position::AFTER, AtomicOrdering::Unordered);
1109
1110 return Changed;
1111 }
1112
1113 if (IsNonTemporal) {
1114 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1115 // for both loads and stores, and the L2 cache policy to STREAM.
1116 Changed |= enableGLCBit(MI);
1117 Changed |= enableSLCBit(MI);
1118 return Changed;
1119 }
1120
1121 return Changed;
1122}
1123
1124bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1125 SIAtomicScope Scope,
1126 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1127 bool IsCrossAddrSpaceOrdering, Position Pos,
1128 AtomicOrdering Order) const {
1129 bool Changed = false;
1130
1131 MachineBasicBlock &MBB = *MI->getParent();
1132 DebugLoc DL = MI->getDebugLoc();
1133
1134 if (Pos == Position::AFTER)
1135 ++MI;
1136
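  // Track whether a vmcnt(0) and/or lgkmcnt(0) wait must be emitted.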
1137 bool VMCnt = false;
1138 bool LGKMCnt = false;
1139
1140 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1141 SIAtomicAddrSpace::NONE) {
1142 switch (Scope) {
1143 case SIAtomicScope::SYSTEM:
1144 case SIAtomicScope::AGENT:
1145 VMCnt |= true;
1146 break;
1147 case SIAtomicScope::WORKGROUP:
1148 case SIAtomicScope::WAVEFRONT:
1149 case SIAtomicScope::SINGLETHREAD:
1150 // The L1 cache keeps all memory operations in order for
1151 // wavefronts in the same work-group.
1152 break;
1153 default:
1154 llvm_unreachable("Unsupported synchronization scope");
1155 }
1156 }
1157
1158 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1159 switch (Scope) {
1160 case SIAtomicScope::SYSTEM:
1161 case SIAtomicScope::AGENT:
1162 case SIAtomicScope::WORKGROUP:
1163 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1164 // not needed as LDS operations for all waves are executed in a total
1165 // global ordering as observed by all waves. Required if also
1166 // synchronizing with global/GDS memory as LDS operations could be
1167 // reordered with respect to later global/GDS memory operations of the
1168 // same wave.
1169 LGKMCnt |= IsCrossAddrSpaceOrdering;
1170 break;
1171 case SIAtomicScope::WAVEFRONT:
1172 case SIAtomicScope::SINGLETHREAD:
1173 // The LDS keeps all memory operations in order for
1174 // the same wavefront.
1175 break;
1176 default:
1177 llvm_unreachable("Unsupported synchronization scope");
1178 }
1179 }
1180
1181 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1182 switch (Scope) {
1183 case SIAtomicScope::SYSTEM:
1184 case SIAtomicScope::AGENT:
1185 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1186 // is not needed as GDS operations for all waves are executed in a total
1187 // global ordering as observed by all waves. Required if also
1188 // synchronizing with global/LDS memory as GDS operations could be
1189 // reordered with respect to later global/LDS memory operations of the
1190 // same wave.
1191 LGKMCnt |= IsCrossAddrSpaceOrdering;
1192 break;
1193 case SIAtomicScope::WORKGROUP:
1194 case SIAtomicScope::WAVEFRONT:
1195 case SIAtomicScope::SINGLETHREAD:
1196 // The GDS keeps all memory operations in order for
1197 // the same work-group.
1198 break;
1199 default:
1200 llvm_unreachable("Unsupported synchronization scope");
1201 }
1202 }
1203
1204 if (VMCnt || LGKMCnt) {
1205 unsigned WaitCntImmediate =
1206 AMDGPU::encodeWaitcnt(IV,
1207 VMCnt ? 0 : getVmcntBitMask(IV),
1208 getExpcntBitMask(IV),
1209 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1210 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1211 .addImm(WaitCntImmediate);
1212 Changed = true;
1213 }
1214
1215 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1216 // at workgroup-scoped release operations that specify the LDS address space.
1217 // SIInsertWaitcnts will later replace this with a vmcnt().
1218 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1219 Scope == SIAtomicScope::WORKGROUP &&
1220 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1221 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1222 Changed = true;
1223 }
1224
1225 if (Pos == Position::AFTER)
1226 --MI;
1227
1228 return Changed;
1229}
1230
1231bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1232 SIAtomicScope Scope,
1233 SIAtomicAddrSpace AddrSpace,
1234 Position Pos) const {
1235 if (!InsertCacheInv)
1236 return false;
1237
1238 bool Changed = false;
1239
1240 MachineBasicBlock &MBB = *MI->getParent();
1241 DebugLoc DL = MI->getDebugLoc();
1242
1243 if (Pos == Position::AFTER)
1244 ++MI;
1245
1246 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1247 switch (Scope) {
1248 case SIAtomicScope::SYSTEM:
1249 case SIAtomicScope::AGENT:
1250 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1251 Changed = true;
1252 break;
1253 case SIAtomicScope::WORKGROUP:
1254 case SIAtomicScope::WAVEFRONT:
1255 case SIAtomicScope::SINGLETHREAD:
1256 // No cache to invalidate.
1257 break;
1258 default:
1259 llvm_unreachable("Unsupported synchronization scope");
1260 }
1261 }
1262
1263 /// The scratch address space does not need the global memory cache
1264 /// to be flushed as all memory operations by the same thread are
1265 /// sequentially consistent, and no other thread can access scratch
1266 /// memory.
1267
1268 /// Other address spaces do not have a cache.
1269
1270 if (Pos == Position::AFTER)
1271 --MI;
1272
1273 return Changed;
1274}
1275
1276bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1277 SIAtomicScope Scope,
1278 SIAtomicAddrSpace AddrSpace,
1279 bool IsCrossAddrSpaceOrdering,
1280 Position Pos) const {
1281 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1282 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1283}
1284
1285bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1286 SIAtomicScope Scope,
1287 SIAtomicAddrSpace AddrSpace,
1288 Position Pos) const {
1289 if (!InsertCacheInv)
1290 return false;
1291
1292 bool Changed = false;
1293
1294 MachineBasicBlock &MBB = *MI->getParent();
1295 DebugLoc DL = MI->getDebugLoc();
1296
1297 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1298
1299 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1300 ? AMDGPU::BUFFER_WBINVL1
1301 : AMDGPU::BUFFER_WBINVL1_VOL;
1302
1303 if (Pos == Position::AFTER)
1304 ++MI;
1305
1306 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1307 switch (Scope) {
1308 case SIAtomicScope::SYSTEM:
1309 case SIAtomicScope::AGENT:
1310 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1311 Changed = true;
1312 break;
1313 case SIAtomicScope::WORKGROUP:
1314 case SIAtomicScope::WAVEFRONT:
1315 case SIAtomicScope::SINGLETHREAD:
1316 // No cache to invalidate.
1317 break;
1318 default:
1319 llvm_unreachable("Unsupported synchronization scope");
1320 }
1321 }
1322
1323 /// The scratch address space does not need the global memory cache
1324 /// to be flushed as all memory operations by the same thread are
1325 /// sequentially consistent, and no other thread can access scratch
1326 /// memory.
1327
1328 /// Other address spaces do not have a cache.
1329
1330 if (Pos == Position::AFTER)
1331 --MI;
1332
1333 return Changed;
1334}
1335
1336bool SIGfx90ACacheControl::enableLoadCacheBypass(
1337 const MachineBasicBlock::iterator &MI,
1338 SIAtomicScope Scope,
1339 SIAtomicAddrSpace AddrSpace) const {
1340 assert(MI->mayLoad() && !MI->mayStore());
1341 bool Changed = false;
1342
1343 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1344 switch (Scope) {
1345 case SIAtomicScope::SYSTEM:
1346 case SIAtomicScope::AGENT:
1347 // Set the L1 cache policy to MISS_LRU.
1348 // Note: there is no L2 cache bypass policy at the ISA level.
1349 Changed |= enableGLCBit(MI);
1350 break;
1351 case SIAtomicScope::WORKGROUP:
1352 // In threadgroup split mode the waves of a work-group can be executing on
1353 // different CUs. Therefore need to bypass the L1 which is per CU.
1354 // Otherwise in non-threadgroup split mode all waves of a work-group are
1355 // on the same CU, and so the L1 does not need to be bypassed.
1356 if (ST.isTgSplitEnabled())
1357 Changed |= enableGLCBit(MI);
1358 break;
1359 case SIAtomicScope::WAVEFRONT:
1360 case SIAtomicScope::SINGLETHREAD:
1361 // No cache to bypass.
1362 break;
1363 default:
1364 llvm_unreachable("Unsupported synchronization scope");
1365 }
1366 }
1367
1368 /// The scratch address space does not need the global memory caches
1369 /// to be bypassed as all memory operations by the same thread are
1370 /// sequentially consistent, and no other thread can access scratch
1371 /// memory.
1372
1373 /// Other address spaces do not have a cache.
1374
1375 return Changed;
1376}
1377
1378bool SIGfx90ACacheControl::enableStoreCacheBypass(
1379 const MachineBasicBlock::iterator &MI,
1380 SIAtomicScope Scope,
1381 SIAtomicAddrSpace AddrSpace) const {
1382 assert(!MI->mayLoad() && MI->mayStore());
1383 bool Changed = false;
1384
1385 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1386 switch (Scope) {
1387 case SIAtomicScope::SYSTEM:
1388 case SIAtomicScope::AGENT:
1389 /// Do not set glc for store atomic operations as they implicitly write
1390 /// through the L1 cache.
1391 break;
1392 case SIAtomicScope::WORKGROUP:
1393 case SIAtomicScope::WAVEFRONT:
1394 case SIAtomicScope::SINGLETHREAD:
1395 // No cache to bypass. Store atomics implicitly write through the L1
1396 // cache.
1397 break;
1398 default:
1399 llvm_unreachable("Unsupported synchronization scope");
1400 }
1401 }
1402
1403 /// The scratch address space does not need the global memory caches
1404 /// to be bypassed as all memory operations by the same thread are
1405 /// sequentially consistent, and no other thread can access scratch
1406 /// memory.
1407
1408 /// Other address spaces do not have a cache.
1409
1410 return Changed;
1411}
1412
1413bool SIGfx90ACacheControl::enableRMWCacheBypass(
1414 const MachineBasicBlock::iterator &MI,
1415 SIAtomicScope Scope,
1416 SIAtomicAddrSpace AddrSpace) const {
1417 assert(MI->mayLoad() && MI->mayStore());
1418 bool Changed = false;
1419
1420 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1421 switch (Scope) {
1422 case SIAtomicScope::SYSTEM:
1423 case SIAtomicScope::AGENT:
1424 /// Do not set glc for RMW atomic operations as they implicitly bypass
1425 /// the L1 cache, and the glc bit is instead used to indicate if they are
1426 /// return or no-return.
1427 break;
1428 case SIAtomicScope::WORKGROUP:
1429 case SIAtomicScope::WAVEFRONT:
1430 case SIAtomicScope::SINGLETHREAD:
1431 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1432 break;
1433 default:
1434 llvm_unreachable("Unsupported synchronization scope");
1435 }
1436 }
1437
1438 return Changed;
1439}
1440
1441bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1442 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1443 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1444 // Only handle load and store, not atomic read-modify-write instructions. The
1445 // latter use glc to indicate if the atomic returns a result and so must not
1446 // be used for cache control.
1447 assert(MI->mayLoad() ^ MI->mayStore());
1448
1449 // Only update load and store, not LLVM IR atomic read-modify-write
1450 // instructions. The latter are always marked as volatile, so they cannot be
1451 // handled sensibly here without pessimizing all atomics. They also do not
1452 // support the nontemporal attribute.
1453 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1454
1455 bool Changed = false;
1456
1457 if (IsVolatile) {
1458 // Set L1 cache policy to be MISS_EVICT for load instructions
1459 // and MISS_LRU for store instructions.
1460 // Note: there is no L2 cache bypass policy at the ISA level.
1461 if (Op == SIMemOp::LOAD)
1462 Changed |= enableGLCBit(MI);
1463
1464 // Ensure operation has completed at system scope to cause all volatile
1465 // operations to be visible outside the program in a global order. Do not
1466 // request cross address space as only the global address space can be
1467 // observable outside the program, so no need to cause a waitcnt for LDS
1468 // address space operations.
1469 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1470 Position::AFTER, AtomicOrdering::Unordered);
1471
1472 return Changed;
1473 }
1474
1475 if (IsNonTemporal) {
1476 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1477 // for both loads and stores, and the L2 cache policy to STREAM.
1478 Changed |= enableGLCBit(MI);
1479 Changed |= enableSLCBit(MI);
1480 return Changed;
1481 }
1482
1483 return Changed;
1484}
1485
1486bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1487 SIAtomicScope Scope,
1488 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1489 bool IsCrossAddrSpaceOrdering,
1490 Position Pos,
1491 AtomicOrdering Order) const {
1492 if (ST.isTgSplitEnabled()) {
1493 // In threadgroup split mode the waves of a work-group can be executing on
1494 // different CUs. Therefore need to wait for global or GDS memory operations
1495 // to complete to ensure they are visible to waves in the other CUs.
1496 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1497 // the same CU, so no need to wait for global memory as all waves in the
1498 // work-group access the same L1, nor wait for GDS as accesses are ordered
1499 // on a CU.
1500 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1501 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1502 (Scope == SIAtomicScope::WORKGROUP)) {
1503 // Same as GFX7 using agent scope.
1504 Scope = SIAtomicScope::AGENT;
1505 }
1506 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1507 // LDS memory operations.
1508 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1509 }
1510 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1511 IsCrossAddrSpaceOrdering, Pos, Order);
1512}
1513
1514bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1515 SIAtomicScope Scope,
1516 SIAtomicAddrSpace AddrSpace,
1517 Position Pos) const {
1518 if (!InsertCacheInv)
1519 return false;
1520
1521 bool Changed = false;
1522
1523 MachineBasicBlock &MBB = *MI->getParent();
1524 DebugLoc DL = MI->getDebugLoc();
1525
1526 if (Pos == Position::AFTER)
1527 ++MI;
1528
1529 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1530 switch (Scope) {
1531 case SIAtomicScope::SYSTEM:
1532 // Ensures that following loads will not see stale remote VMEM data or
1533 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1534 // CC will never be stale due to the local memory probes.
1535 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1536 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1537 // hardware does not reorder memory operations by the same wave with
1538 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1539 // remove any cache lines of earlier writes by the same wave and ensures
1540 // later reads by the same wave will refetch the cache lines.
1541 Changed = true;
1542 break;
1543 case SIAtomicScope::AGENT:
1544 // Same as GFX7.
1545 break;
1546 case SIAtomicScope::WORKGROUP:
1547 // In threadgroup split mode the waves of a work-group can be executing on
1548 // different CUs. Therefore need to invalidate the L1 which is per CU.
1549 // Otherwise in non-threadgroup split mode all waves of a work-group are
1550 // on the same CU, and so the L1 does not need to be invalidated.
1551 if (ST.isTgSplitEnabled()) {
1552 // Same as GFX7 using agent scope.
1553 Scope = SIAtomicScope::AGENT;
1554 }
1555 break;
1556 case SIAtomicScope::WAVEFRONT:
1557 case SIAtomicScope::SINGLETHREAD:
1558 // Same as GFX7.
1559 break;
1560 default:
1561 llvm_unreachable("Unsupported synchronization scope");
1562 }
1563 }
1564
1565 /// The scratch address space does not need the global memory cache
1566 /// to be flushed as all memory operations by the same thread are
1567 /// sequentially consistent, and no other thread can access scratch
1568 /// memory.
1569
1570 /// Other address spaces do not have a cache.
1571
1572 if (Pos == Position::AFTER)
1573 --MI;
1574
1575 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1576
1577 return Changed;
1578}
1579
1580bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1581 SIAtomicScope Scope,
1582 SIAtomicAddrSpace AddrSpace,
1583 bool IsCrossAddrSpaceOrdering,
1584 Position Pos) const {
1585 bool Changed = false;
1586
1587 MachineBasicBlock &MBB = *MI->getParent();
1588 const DebugLoc &DL = MI->getDebugLoc();
1589
1590 if (Pos == Position::AFTER)
1591 ++MI;
1592
1593 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1594 switch (Scope) {
1595 case SIAtomicScope::SYSTEM:
1596 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1597 // hardware does not reorder memory operations by the same wave with
1598 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1599 // to initiate writeback of any dirty cache lines of earlier writes by the
1600 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1601 // writeback has completed.
1602 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1603 // Set SC bits to indicate system scope.
1604 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1605 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1606 // vmcnt(0)" needed by the "BUFFER_WBL2".
1607 Changed = true;
1608 break;
1609 case SIAtomicScope::AGENT:
1610 case SIAtomicScope::WORKGROUP:
1611 case SIAtomicScope::WAVEFRONT:
1612 case SIAtomicScope::SINGLETHREAD:
1613 // Same as GFX7.
1614 break;
1615 default:
1616 llvm_unreachable("Unsupported synchronization scope");
1617 }
1618 }
1619
1620 if (Pos == Position::AFTER)
1621 --MI;
1622
1623 Changed |=
1624 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1625 IsCrossAddrSpaceOrdering, Pos);
1626
1627 return Changed;
1628}
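// Illustrative result (not generated verbatim here): a system-scope release on
// gfx90a typically becomes a "buffer_wbl2" followed by the "s_waitcnt vmcnt(0)"
// produced by the delegated GFX7 release path above.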
1629
1630bool SIGfx940CacheControl::enableLoadCacheBypass(
1631 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1632 SIAtomicAddrSpace AddrSpace) const {
1633 assert(MI->mayLoad() && !MI->mayStore());
1634 bool Changed = false;
1635
1636 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1637 switch (Scope) {
1638 case SIAtomicScope::SYSTEM:
1639 // Set SC bits to indicate system scope.
1640 Changed |= enableSC0Bit(MI);
1641 Changed |= enableSC1Bit(MI);
1642 break;
1643 case SIAtomicScope::AGENT:
1644 // Set SC bits to indicate agent scope.
1645 Changed |= enableSC1Bit(MI);
1646 break;
1647 case SIAtomicScope::WORKGROUP:
1648 // In threadgroup split mode the waves of a work-group can be executing on
1649 // different CUs. Therefore need to bypass the L1 which is per CU.
1650 // Otherwise in non-threadgroup split mode all waves of a work-group are
1651 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1652 // bits to indicate work-group scope will do this automatically.
1653 Changed |= enableSC0Bit(MI);
1654 break;
1655 case SIAtomicScope::WAVEFRONT:
1656 case SIAtomicScope::SINGLETHREAD:
1657 // Leave SC bits unset to indicate wavefront scope.
1658 break;
1659 default:
1660 llvm_unreachable("Unsupported synchronization scope");
1661 }
1662 }
1663
1664 /// The scratch address space does not need the global memory caches
1665 /// to be bypassed as all memory operations by the same thread are
1666 /// sequentially consistent, and no other thread can access scratch
1667 /// memory.
1668
1669 /// Other address spaces do not have a cache.
1670
1671 return Changed;
1672}
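// Summary of the SC-bit mapping implemented above for gfx940 loads:
//   SYSTEM                 -> sc0 sc1
//   AGENT                  -> sc1
//   WORKGROUP              -> sc0
//   WAVEFRONT/SINGLETHREAD -> neither bit set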
1673
1674 bool SIGfx940CacheControl::enableStoreCacheBypass(
1675 const MachineBasicBlock::iterator &MI,
1676 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1677 assert(!MI->mayLoad() && MI->mayStore());
1678 bool Changed = false;
1679
1680 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1681 switch (Scope) {
1682 case SIAtomicScope::SYSTEM:
1683 // Set SC bits to indicate system scope.
1684 Changed |= enableSC0Bit(MI);
1685 Changed |= enableSC1Bit(MI);
1686 break;
1687 case SIAtomicScope::AGENT:
1688 // Set SC bits to indicate agent scope.
1689 Changed |= enableSC1Bit(MI);
1690 break;
1691 case SIAtomicScope::WORKGROUP:
1692 // Set SC bits to indicate workgroup scope.
1693 Changed |= enableSC0Bit(MI);
1694 break;
1695 case SIAtomicScope::WAVEFRONT:
1696 case SIAtomicScope::SINGLETHREAD:
1697 // Leave SC bits unset to indicate wavefront scope.
1698 break;
1699 default:
1700 llvm_unreachable("Unsupported synchronization scope");
1701 }
1702 }
1703
1704 /// The scratch address space does not need the global memory caches
1705 /// to be bypassed as all memory operations by the same thread are
1706 /// sequentially consistent, and no other thread can access scratch
1707 /// memory.
1708
1709 /// Other address spaces do not have a cache.
1710
1711 return Changed;
1712}
1713
1714bool SIGfx940CacheControl::enableRMWCacheBypass(
1715 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1716 SIAtomicAddrSpace AddrSpace) const {
1717 assert(MI->mayLoad() && MI->mayStore());
1718 bool Changed = false;
1719
1720 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1721 switch (Scope) {
1722 case SIAtomicScope::SYSTEM:
1723 // Set SC1 bit to indicate system scope.
1724 Changed |= enableSC1Bit(MI);
1725 break;
1726 case SIAtomicScope::AGENT:
1727 case SIAtomicScope::WORKGROUP:
1728 case SIAtomicScope::WAVEFRONT:
1729 case SIAtomicScope::SINGLETHREAD:
1730 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1731 // to indicate system or agent scope. The SC0 bit is used to indicate if
1732 // they are return or no-return. Leave SC1 bit unset to indicate agent
1733 // scope.
1734 break;
1735 default:
1736 llvm_unreachable("Unsupported synchronization scope");
1737 }
1738 }
1739
1740 return Changed;
1741}
1742
1743bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1744 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1745 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1746 // Only handle load and store, not atomic read-modify-write instructions. The
1747 // latter use glc to indicate if the atomic returns a result and so must not
1748 // be used for cache control.
1749 assert(MI->mayLoad() ^ MI->mayStore());
1750
1751 // Only update load and store, not LLVM IR atomic read-modify-write
1752 // instructions. The latter are always marked as volatile, so they cannot be
1753 // handled sensibly here without pessimizing all atomics. They also do not support
1754 // the nontemporal attribute.
1755 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1756
1757 bool Changed = false;
1758
1759 if (IsVolatile) {
1760 // Set SC bits to indicate system scope.
1761 Changed |= enableSC0Bit(MI);
1762 Changed |= enableSC1Bit(MI);
1763
1764 // Ensure operation has completed at system scope to cause all volatile
1765 // operations to be visible outside the program in a global order. Do not
1766 // request cross address space as only the global address space can be
1767 // observable outside the program, so no need to cause a waitcnt for LDS
1768 // address space operations.
1769 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1770 Position::AFTER, AtomicOrdering::Unordered);
1771
1772 return Changed;
1773 }
1774
1775 if (IsNonTemporal) {
1776 Changed |= enableNTBit(MI);
1777 return Changed;
1778 }
1779
1780 return Changed;
1781}
1782
1783bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1784 SIAtomicScope Scope,
1785 SIAtomicAddrSpace AddrSpace,
1786 Position Pos) const {
1787 if (!InsertCacheInv)
1788 return false;
1789
1790 bool Changed = false;
1791
1792 MachineBasicBlock &MBB = *MI->getParent();
1793 DebugLoc DL = MI->getDebugLoc();
1794
1795 if (Pos == Position::AFTER)
1796 ++MI;
1797
1798 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1799 switch (Scope) {
1800 case SIAtomicScope::SYSTEM:
1801 // Ensures that following loads will not see stale remote VMEM data or
1802 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1803 // CC will never be stale due to the local memory probes.
1804 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1805 // Set SC bits to indicate system scope.
1806 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1807 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1808 // hardware does not reorder memory operations by the same wave with
1809 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1810 // remove any cache lines of earlier writes by the same wave and ensures
1811 // later reads by the same wave will refetch the cache lines.
1812 Changed = true;
1813 break;
1814 case SIAtomicScope::AGENT:
1815 // Ensures that following loads will not see stale remote data or local
1816 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1817 // due to the memory probes.
1818 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1819 // Set SC bits to indicate agent scope.
1820 .addImm(AMDGPU::CPol::SC1);
1821 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1822 // does not reorder memory operations with respect to the preceding buffer
1823 // invalidate. The invalidate is guaranteed to remove any cache lines of
1824 // earlier writes and ensures later reads will refetch the cache lines.
1825 Changed = true;
1826 break;
1827 case SIAtomicScope::WORKGROUP:
1828 // In threadgroup split mode the waves of a work-group can be executing on
1829 // different CUs. Therefore need to invalidate the L1 which is per CU.
1830 // Otherwise in non-threadgroup split mode all waves of a work-group are
1831 // on the same CU, and so the L1 does not need to be invalidated.
1832 if (ST.isTgSplitEnabled()) {
1833 // Ensures L1 is invalidated if in threadgroup split mode. In
1834 // non-threadgroup split mode it is a NOP, but there is no point generating
1835 // it in that case if we know we are not in that mode.
1836 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1837 // Set SC bits to indicate work-group scope.
1838 .addImm(AMDGPU::CPol::SC0);
1839 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1840 // does not reorder memory operations with respect to the preceding buffer
1841 // invalidate. The invalidate is guaranteed to remove any cache lines of
1842 // earlier writes and ensures later reads will refetch the cache lines.
1843 Changed = true;
1844 }
1845 break;
1846 case SIAtomicScope::WAVEFRONT:
1847 case SIAtomicScope::SINGLETHREAD:
1848 // Could generate "BUFFER_INV" but it would do nothing as there are no
1849 // caches to invalidate.
1850 break;
1851 default:
1852 llvm_unreachable("Unsupported synchronization scope");
1853 }
1854 }
1855
1856 /// The scratch address space does not need the global memory cache
1857 /// to be flushed as all memory operations by the same thread are
1858 /// sequentially consistent, and no other thread can access scratch
1859 /// memory.
1860
1861 /// Other address spaces do not have a cache.
1862
1863 if (Pos == Position::AFTER)
1864 --MI;
1865
1866 return Changed;
1867}
1868
1869bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1870 SIAtomicScope Scope,
1871 SIAtomicAddrSpace AddrSpace,
1872 bool IsCrossAddrSpaceOrdering,
1873 Position Pos) const {
1874 bool Changed = false;
1875
1876 MachineBasicBlock &MBB = *MI->getParent();
1877 DebugLoc DL = MI->getDebugLoc();
1878
1879 if (Pos == Position::AFTER)
1880 ++MI;
1881
1882 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1883 switch (Scope) {
1884 case SIAtomicScope::SYSTEM:
1885 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1886 // hardware does not reorder memory operations by the same wave with
1887 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1888 // to initiate writeback of any dirty cache lines of earlier writes by the
1889 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1890 // writeback has completed.
1891 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1892 // Set SC bits to indicate system scope.
1893 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1894 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1895 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1896 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1897 Changed = true;
1898 break;
1899 case SIAtomicScope::AGENT:
1900 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1901 // Set SC bits to indicate agent scope.
1902 .addImm(AMDGPU::CPol::SC1);
1903
1904 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1905 // SIAtomicScope::AGENT, the following insertWait will generate the
1906 // required "S_WAITCNT vmcnt(0)".
1907 Changed = true;
1908 break;
1909 case SIAtomicScope::WORKGROUP:
1910 case SIAtomicScope::WAVEFRONT:
1911 case SIAtomicScope::SINGLETHREAD:
1912 // Do not generate "BUFFER_WBL2" as there are no caches it would
1913 // writeback, and would require an otherwise unnecessary
1914 // "S_WAITCNT vmcnt(0)".
1915 break;
1916 default:
1917 llvm_unreachable("Unsupported synchronization scope");
1918 }
1919 }
1920
1921 if (Pos == Position::AFTER)
1922 --MI;
1923
1924 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1925 // S_WAITCNT needed.
1926 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1927 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1928
1929 return Changed;
1930}
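// Illustrative result: an agent-scope release on gfx940 becomes a
// "buffer_wbl2 sc1" followed by the "s_waitcnt vmcnt(0)" emitted by the
// insertWait call above; the exact operand spelling depends on the assembler.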
1931
1932 bool SIGfx10CacheControl::enableLoadCacheBypass(
1933 const MachineBasicBlock::iterator &MI,
1934 SIAtomicScope Scope,
1935 SIAtomicAddrSpace AddrSpace) const {
1936 assert(MI->mayLoad() && !MI->mayStore());
1937 bool Changed = false;
1938
1939 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1940 switch (Scope) {
1941 case SIAtomicScope::SYSTEM:
1942 case SIAtomicScope::AGENT:
1943 // Set the L0 and L1 cache policies to MISS_EVICT.
1944 // Note: there is no L2 cache coherent bypass control at the ISA level.
1945 Changed |= enableGLCBit(MI);
1946 Changed |= enableDLCBit(MI);
1947 break;
1948 case SIAtomicScope::WORKGROUP:
1949 // In WGP mode the waves of a work-group can be executing on either CU of
1950 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1951 // CU mode all waves of a work-group are on the same CU, and so the L0
1952 // does not need to be bypassed.
1953 if (!ST.isCuModeEnabled())
1954 Changed |= enableGLCBit(MI);
1955 break;
1956 case SIAtomicScope::WAVEFRONT:
1957 case SIAtomicScope::SINGLETHREAD:
1958 // No cache to bypass.
1959 break;
1960 default:
1961 llvm_unreachable("Unsupported synchronization scope");
1962 }
1963 }
1964
1965 /// The scratch address space does not need the global memory caches
1966 /// to be bypassed as all memory operations by the same thread are
1967 /// sequentially consistent, and no other thread can access scratch
1968 /// memory.
1969
1970 /// Other address spaces do not have a cache.
1971
1972 return Changed;
1973}
1974
1975bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1976 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1977 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1978
1979 // Only handle load and store, not atomic read-modify-write instructions. The
1980 // latter use glc to indicate if the atomic returns a result and so must not
1981 // be used for cache control.
1982 assert(MI->mayLoad() ^ MI->mayStore());
1983
1984 // Only update load and store, not LLVM IR atomic read-modify-write
1985 // instructions. The latter are always marked as volatile, so they cannot be
1986 // handled sensibly here without pessimizing all atomics. They also do not support
1987 // the nontemporal attribute.
1988 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1989
1990 bool Changed = false;
1991
1992 if (IsVolatile) {
1993 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1994 // and MISS_LRU for store instructions.
1995 // Note: there is no L2 cache coherent bypass control at the ISA level.
1996 if (Op == SIMemOp::LOAD) {
1997 Changed |= enableGLCBit(MI);
1998 Changed |= enableDLCBit(MI);
1999 }
2000
2001 // Ensure operation has completed at system scope to cause all volatile
2002 // operations to be visible outside the program in a global order. Do not
2003 // request cross address space as only the global address space can be
2004 // observable outside the program, so no need to cause a waitcnt for LDS
2005 // address space operations.
2006 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2007 Position::AFTER, AtomicOrdering::Unordered);
2008 return Changed;
2009 }
2010
2011 if (IsNonTemporal) {
2012 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2013 // and L2 cache policy to STREAM.
2014 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2015 // to MISS_EVICT and the L2 cache policy to STREAM.
2016 if (Op == SIMemOp::STORE)
2017 Changed |= enableGLCBit(MI);
2018 Changed |= enableSLCBit(MI);
2019
2020 return Changed;
2021 }
2022
2023 return Changed;
2024}
2025
2026bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2027 SIAtomicScope Scope,
2028 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2029 bool IsCrossAddrSpaceOrdering,
2030 Position Pos, AtomicOrdering Order) const {
2031 bool Changed = false;
2032
2033 MachineBasicBlock &MBB = *MI->getParent();
2034 DebugLoc DL = MI->getDebugLoc();
2035
2036 if (Pos == Position::AFTER)
2037 ++MI;
2038
2039 bool VMCnt = false;
2040 bool VSCnt = false;
2041 bool LGKMCnt = false;
2042
2043 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2044 SIAtomicAddrSpace::NONE) {
2045 switch (Scope) {
2046 case SIAtomicScope::SYSTEM:
2047 case SIAtomicScope::AGENT:
2048 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2049 VMCnt |= true;
2050 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2051 VSCnt |= true;
2052 break;
2053 case SIAtomicScope::WORKGROUP:
2054 // In WGP mode the waves of a work-group can be executing on either CU of
2055 // the WGP. Therefore need to wait for operations to complete to ensure
2056 // they are visible to waves in the other CU as the L0 is per CU.
2057 // Otherwise in CU mode all waves of a work-group are on the same CU
2058 // which shares the same L0.
2059 if (!ST.isCuModeEnabled()) {
2060 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2061 VMCnt |= true;
2062 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2063 VSCnt |= true;
2064 }
2065 break;
2066 case SIAtomicScope::WAVEFRONT:
2067 case SIAtomicScope::SINGLETHREAD:
2068 // The L0 cache keeps all memory operations in order for
2069 // work-items in the same wavefront.
2070 break;
2071 default:
2072 llvm_unreachable("Unsupported synchronization scope");
2073 }
2074 }
2075
2076 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2077 switch (Scope) {
2078 case SIAtomicScope::SYSTEM:
2079 case SIAtomicScope::AGENT:
2080 case SIAtomicScope::WORKGROUP:
2081 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2082 // not needed as LDS operations for all waves are executed in a total
2083 // global ordering as observed by all waves. Required if also
2084 // synchronizing with global/GDS memory as LDS operations could be
2085 // reordered with respect to later global/GDS memory operations of the
2086 // same wave.
2087 LGKMCnt |= IsCrossAddrSpaceOrdering;
2088 break;
2089 case SIAtomicScope::WAVEFRONT:
2090 case SIAtomicScope::SINGLETHREAD:
2091 // The LDS keeps all memory operations in order for
2092 // the same wavefront.
2093 break;
2094 default:
2095 llvm_unreachable("Unsupported synchronization scope");
2096 }
2097 }
2098
2099 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2100 switch (Scope) {
2101 case SIAtomicScope::SYSTEM:
2102 case SIAtomicScope::AGENT:
2103 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
2104 // is not needed as GDS operations for all waves are executed in a total
2105 // global ordering as observed by all waves. Required if also
2106 // synchronizing with global/LDS memory as GDS operations could be
2107 // reordered with respect to later global/LDS memory operations of the
2108 // same wave.
2109 LGKMCnt |= IsCrossAddrSpaceOrdering;
2110 break;
2111 case SIAtomicScope::WORKGROUP:
2112 case SIAtomicScope::WAVEFRONT:
2113 case SIAtomicScope::SINGLETHREAD:
2114 // The GDS keeps all memory operations in order for
2115 // the same work-group.
2116 break;
2117 default:
2118 llvm_unreachable("Unsupported synchronization scope");
2119 }
2120 }
2121
2122 if (VMCnt || LGKMCnt) {
2122 unsigned WaitCntImmediate =
2123 AMDGPU::encodeWaitcnt(
2124 IV,
2125 VMCnt ? 0 : getVmcntBitMask(IV),
2126 getExpcntBitMask(IV),
2127 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2128 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2129 .addImm(WaitCntImmediate);
2130 Changed = true;
2131 }
2132
2133 // On architectures that support direct loads to LDS, emit an unknown waitcnt
2134 // at workgroup-scoped release operations that specify the LDS address space.
2135 // SIInsertWaitcnts will later replace this with a vmcnt().
2136 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
2137 Scope == SIAtomicScope::WORKGROUP &&
2138 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2139 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
2140 Changed = true;
2141 }
2142
2143 if (VSCnt) {
2144 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2145 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2146 .addImm(0);
2147 Changed = true;
2148 }
2149
2150 if (Pos == Position::AFTER)
2151 --MI;
2152
2153 return Changed;
2154}
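// Illustrative result: an agent-scope acquire fence over global loads and
// stores emits roughly "s_waitcnt vmcnt(0)" plus "s_waitcnt_vscnt null, 0x0".
// Both are emitted as soft waitcnts, which SIInsertWaitcnts may later merge
// with its own counts or drop if it can prove them redundant.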
2155
2156bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2157 SIAtomicScope Scope,
2158 SIAtomicAddrSpace AddrSpace,
2159 Position Pos) const {
2160 if (!InsertCacheInv)
2161 return false;
2162
2163 bool Changed = false;
2164
2165 MachineBasicBlock &MBB = *MI->getParent();
2166 DebugLoc DL = MI->getDebugLoc();
2167
2168 if (Pos == Position::AFTER)
2169 ++MI;
2170
2171 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2172 switch (Scope) {
2173 case SIAtomicScope::SYSTEM:
2174 case SIAtomicScope::AGENT:
2175 // The order of invalidates matters here. We must invalidate "outer in"
2176 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2177 // invalidated.
2178 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2179 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2180 Changed = true;
2181 break;
2182 case SIAtomicScope::WORKGROUP:
2183 // In WGP mode the waves of a work-group can be executing on either CU of
2184 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2185 // in CU mode all waves of a work-group are on the same CU, and so the
2186 // L0 does not need to be invalidated.
2187 if (!ST.isCuModeEnabled()) {
2188 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2189 Changed = true;
2190 }
2191 break;
2192 case SIAtomicScope::WAVEFRONT:
2193 case SIAtomicScope::SINGLETHREAD:
2194 // No cache to invalidate.
2195 break;
2196 default:
2197 llvm_unreachable("Unsupported synchronization scope");
2198 }
2199 }
2200
2201 /// The scratch address space does not need the global memory cache
2202 /// to be flushed as all memory operations by the same thread are
2203 /// sequentially consistent, and no other thread can access scratch
2204 /// memory.
2205
2206 /// Other address spaces do not have a cache.
2207
2208 if (Pos == Position::AFTER)
2209 --MI;
2210
2211 return Changed;
2212}
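// Illustrative overall sequence (assuming a seq_cst agent-scope global load on
// GFX10): the load is marked glc dlc, insertWait adds waits before and after
// it, and this function appends "buffer_gl1_inv" then "buffer_gl0_inv" so the
// L0 cannot be refilled from a stale L1 line.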
2213
2214 bool SIGfx10CacheControl::insertBarrierStart(
2215 MachineBasicBlock::iterator &MI) const {
2216 // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
2217 // mode. This is because a CU mode release fence does not emit any wait, which
2218 // is fine when only dealing with vmem, but isn't sufficient in the presence
2219 // of barriers which do not go through vmem.
2220 // GFX12.5 does not require this additional wait.
2221 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
2222 return false;
2223
2224 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2225 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
2226 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
2227 return true;
2228}
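// The emitted instruction is a single "s_waitcnt_depctr" whose vm_vsrc field
// is set to 0, i.e. (roughly) wait until outstanding VMEM instructions have
// read their VGPR sources before the barrier proceeds.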
2229
2230bool SIGfx11CacheControl::enableLoadCacheBypass(
2231 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2232 SIAtomicAddrSpace AddrSpace) const {
2233 assert(MI->mayLoad() && !MI->mayStore());
2234 bool Changed = false;
2235
2236 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2237 switch (Scope) {
2238 case SIAtomicScope::SYSTEM:
2239 case SIAtomicScope::AGENT:
2240 // Set the L0 and L1 cache policies to MISS_EVICT.
2241 // Note: there is no L2 cache coherent bypass control at the ISA level.
2242 Changed |= enableGLCBit(MI);
2243 break;
2244 case SIAtomicScope::WORKGROUP:
2245 // In WGP mode the waves of a work-group can be executing on either CU of
2246 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2247 // CU mode all waves of a work-group are on the same CU, and so the L0
2248 // does not need to be bypassed.
2249 if (!ST.isCuModeEnabled())
2250 Changed |= enableGLCBit(MI);
2251 break;
2252 case SIAtomicScope::WAVEFRONT:
2253 case SIAtomicScope::SINGLETHREAD:
2254 // No cache to bypass.
2255 break;
2256 default:
2257 llvm_unreachable("Unsupported synchronization scope");
2258 }
2259 }
2260
2261 /// The scratch address space does not need the global memory caches
2262 /// to be bypassed as all memory operations by the same thread are
2263 /// sequentially consistent, and no other thread can access scratch
2264 /// memory.
2265
2266 /// Other address spaces do not have a cache.
2267
2268 return Changed;
2269}
2270
2271bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2272 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2273 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2274
2275 // Only handle load and store, not atomic read-modify-write instructions. The
2276 // latter use glc to indicate if the atomic returns a result and so must not
2277 // be used for cache control.
2278 assert(MI->mayLoad() ^ MI->mayStore());
2279
2280 // Only update load and store, not LLVM IR atomic read-modify-write
2281 // instructions. The latter are always marked as volatile, so they cannot be
2282 // handled sensibly here without pessimizing all atomics. They also do not support
2283 // the nontemporal attribute.
2284 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2285
2286 bool Changed = false;
2287
2288 if (IsVolatile) {
2289 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2290 // and MISS_LRU for store instructions.
2291 // Note: there is no L2 cache coherent bypass control at the ISA level.
2292 if (Op == SIMemOp::LOAD)
2293 Changed |= enableGLCBit(MI);
2294
2295 // Set MALL NOALLOC for load and store instructions.
2296 Changed |= enableDLCBit(MI);
2297
2298 // Ensure operation has completed at system scope to cause all volatile
2299 // operations to be visible outside the program in a global order. Do not
2300 // request cross address space as only the global address space can be
2301 // observable outside the program, so no need to cause a waitcnt for LDS
2302 // address space operations.
2303 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2304 Position::AFTER, AtomicOrdering::Unordered);
2305 return Changed;
2306 }
2307
2308 if (IsNonTemporal) {
2309 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2310 // and L2 cache policy to STREAM.
2311 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2312 // to MISS_EVICT and the L2 cache policy to STREAM.
2313 if (Op == SIMemOp::STORE)
2314 Changed |= enableGLCBit(MI);
2315 Changed |= enableSLCBit(MI);
2316
2317 // Set MALL NOALLOC for load and store instructions.
2318 Changed |= enableDLCBit(MI);
2319 return Changed;
2320 }
2321
2322 return Changed;
2323}
2324
2325bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2326 AMDGPU::CPol::CPol Value) const {
2327 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2328 if (!CPol)
2329 return false;
2330
2331 uint64_t NewTH = Value & AMDGPU::CPol::TH;
2332 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2333 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2334 return true;
2335 }
2336
2337 return false;
2338}
2339
2340bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2341 AMDGPU::CPol::CPol Value) const {
2342 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2343 if (!CPol)
2344 return false;
2345
2346 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2347 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2348 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2349 return true;
2350 }
2351
2352 return false;
2353}
2354
2355bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2356 const MachineBasicBlock::iterator MI) const {
2357 // TODO: implement flag for frontend to give us a hint not to insert waits.
2358
2359 MachineBasicBlock &MBB = *MI->getParent();
2360 const DebugLoc &DL = MI->getDebugLoc();
2361
2362 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2363 if (ST.hasImageInsts()) {
2364 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2365 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2366 }
2367 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2368 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2369
2370 return true;
2371}
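// Illustrative result on GFX12.0 before a scope:SCOPE_SYS store:
//   s_wait_loadcnt 0x0
//   s_wait_samplecnt 0x0   (only if the target has image instructions)
//   s_wait_bvhcnt 0x0      (likewise)
//   s_wait_kmcnt 0x0
//   s_wait_storecnt 0x0
// all as soft waits that SIInsertWaitcnts may later relax.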
2372
2373bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2374 SIAtomicScope Scope,
2375 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2376 bool IsCrossAddrSpaceOrdering,
2377 Position Pos, AtomicOrdering Order) const {
2378 bool Changed = false;
2379
2380 MachineBasicBlock &MBB = *MI->getParent();
2381 DebugLoc DL = MI->getDebugLoc();
2382
2383 bool LOADCnt = false;
2384 bool DSCnt = false;
2385 bool STORECnt = false;
2386
2387 if (Pos == Position::AFTER)
2388 ++MI;
2389
2390 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2391 SIAtomicAddrSpace::NONE) {
2392 switch (Scope) {
2393 case SIAtomicScope::SYSTEM:
2394 case SIAtomicScope::AGENT:
2395 case SIAtomicScope::CLUSTER:
2396 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2397 LOADCnt |= true;
2398 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2399 STORECnt |= true;
2400 break;
2401 case SIAtomicScope::WORKGROUP:
2402 // GFX12.0:
2403 // In WGP mode the waves of a work-group can be executing on either CU
2404 // of the WGP. Therefore need to wait for operations to complete to
2405 // ensure they are visible to waves in the other CU as the L0 is per CU.
2406 // Otherwise in CU mode all waves of a work-group are on the same CU
2407 // which shares the same L0.
2408 //
2409 // GFX12.5:
2410 // TODO DOCS
2411 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
2412 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2413 LOADCnt |= true;
2414 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2415 STORECnt |= true;
2416 }
2417 break;
2418 case SIAtomicScope::WAVEFRONT:
2419 case SIAtomicScope::SINGLETHREAD:
2420 // The L0 cache keeps all memory operations in order for
2421 // work-items in the same wavefront.
2422 break;
2423 default:
2424 llvm_unreachable("Unsupported synchronization scope");
2425 }
2426 }
2427
2428 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2429 switch (Scope) {
2430 case SIAtomicScope::SYSTEM:
2431 case SIAtomicScope::AGENT:
2432 case SIAtomicScope::CLUSTER:
2433 case SIAtomicScope::WORKGROUP:
2434 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2435 // not needed as LDS operations for all waves are executed in a total
2436 // global ordering as observed by all waves. Required if also
2437 // synchronizing with global/GDS memory as LDS operations could be
2438 // reordered with respect to later global/GDS memory operations of the
2439 // same wave.
2440 DSCnt |= IsCrossAddrSpaceOrdering;
2441 break;
2442 case SIAtomicScope::WAVEFRONT:
2443 case SIAtomicScope::SINGLETHREAD:
2444 // The LDS keeps all memory operations in order for
2445 // the same wavefront.
2446 break;
2447 default:
2448 llvm_unreachable("Unsupported synchronization scope");
2449 }
2450 }
2451
2452 if (LOADCnt) {
2453 // Acquire sequences only need to wait on the previous atomic operation.
2454 // e.g. a typical sequence looks like
2455 // atomic load
2456 // (wait)
2457 // global_inv
2458 //
2459 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
2460 // to be tracked using loadcnt.
2461 //
2462 // This also applies to fences. Fences cannot pair with an instruction
2463 // tracked with bvh/samplecnt as we don't have any atomics that do that.
2464 if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
2465 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2466 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2467 }
2468 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2469 Changed = true;
2470 }
2471
2472 if (STORECnt) {
2473 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2474 Changed = true;
2475 }
2476
2477 if (DSCnt) {
2478 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2479 Changed = true;
2480 }
2481
2482 if (Pos == Position::AFTER)
2483 --MI;
2484
2485 return Changed;
2486}
2487
2488bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2489 SIAtomicScope Scope,
2490 SIAtomicAddrSpace AddrSpace,
2491 Position Pos) const {
2492 if (!InsertCacheInv)
2493 return false;
2494
2495 MachineBasicBlock &MBB = *MI->getParent();
2496 DebugLoc DL = MI->getDebugLoc();
2497
2498 /// The scratch address space does not need the global memory cache
2499 /// to be flushed as all memory operations by the same thread are
2500 /// sequentially consistent, and no other thread can access scratch
2501 /// memory.
2502
2503 /// Other address spaces do not have a cache.
2504 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2505 return false;
2506
2507 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2508 switch (Scope) {
2509 case SIAtomicScope::SYSTEM:
2510 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2511 break;
2512 case SIAtomicScope::AGENT:
2513 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2514 break;
2515 case SIAtomicScope::CLUSTER:
2516 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2517 break;
2518 case SIAtomicScope::WORKGROUP:
2519 // GFX12.0:
2520 // In WGP mode the waves of a work-group can be executing on either CU of
2521 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2522 // Otherwise in CU mode all waves of a work-group are on the same CU, and
2523 // so the L0 does not need to be invalidated.
2524 //
2525 // GFX12.5
2526 // TODO DOCS
2527 if (ST.isCuModeEnabled())
2528 return false;
2529
2530 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2531 break;
2532 case SIAtomicScope::WAVEFRONT:
2533 case SIAtomicScope::SINGLETHREAD:
2534 // No cache to invalidate.
2535 return false;
2536 default:
2537 llvm_unreachable("Unsupported synchronization scope");
2538 }
2539
2540 if (Pos == Position::AFTER)
2541 ++MI;
2542
2543 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2544
2545 if (Pos == Position::AFTER)
2546 --MI;
2547
2548 return true;
2549}
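// Illustrative result: an agent-scope acquire inserts roughly
// "global_inv scope:SCOPE_DEV" after the atomic, while a cluster-scope or
// (WGP-mode) workgroup-scope acquire uses SCOPE_SE instead.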
2550
2551bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2552 SIAtomicScope Scope,
2553 SIAtomicAddrSpace AddrSpace,
2554 bool IsCrossAddrSpaceOrdering,
2555 Position Pos) const {
2556 MachineBasicBlock &MBB = *MI->getParent();
2557 DebugLoc DL = MI->getDebugLoc();
2558
2559 // The scratch address space does not need the global memory cache
2560 // writeback as all memory operations by the same thread are
2561 // sequentially consistent, and no other thread can access scratch
2562 // memory.
2563
2564 // Other address spaces do not have a cache.
2565 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2566 return false;
2567
2568 if (Pos == Position::AFTER)
2569 ++MI;
2570
2571 // global_wb is only necessary at system scope for GFX12.0; it is also
2572 // necessary at device scope for GFX12.5.
2573 //
2574 // Emitting it for lower scopes is a slow no-op, so we omit it
2575 // for performance.
2576 switch (Scope) {
2577 case SIAtomicScope::SYSTEM:
2578 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2579 .addImm(AMDGPU::CPol::SCOPE_SYS);
2580 break;
2581 case SIAtomicScope::AGENT:
2582 // TODO DOCS
2583 if (ST.hasGFX1250Insts()) {
2584 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2585 .addImm(AMDGPU::CPol::SCOPE_DEV);
2586 }
2587 break;
2588 case SIAtomicScope::CLUSTER:
2589 case SIAtomicScope::WORKGROUP:
2590 // No WB necessary, but we still have to wait.
2591 break;
2592 case SIAtomicScope::WAVEFRONT:
2593 case SIAtomicScope::SINGLETHREAD:
2594 // No WB or wait necessary here.
2595 return false;
2596 default:
2597 llvm_unreachable("Unsupported synchronization scope");
2598 }
2599
2600 if (Pos == Position::AFTER)
2601 --MI;
2602
2603 // We always have to wait for previous memory operations (load/store) to
2604 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2605 // we of course need to wait for that as well.
2606 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2607 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
2608
2609 return true;
2610}
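// Illustrative result: a system-scope release on GFX12.0 becomes a "global_wb"
// with SCOPE_SYS followed by the soft waits emitted by insertWait above;
// workgroup- and cluster-scope releases skip the writeback and emit only the
// waits.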
2611
2612bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2613 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2614 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2615
2616 // Only handle load and store, not atomic read-modify-write instructions.
2617 assert(MI->mayLoad() ^ MI->mayStore());
2618
2619 // Only update load and store, not LLVM IR atomic read-modify-write
2620 // instructions. The latter are always marked as volatile, so they cannot be
2621 // handled sensibly here without pessimizing all atomics. They also do not support
2622 // the nontemporal attribute.
2623 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2624
2625 bool Changed = false;
2626
2627 if (IsLastUse) {
2628 // Set last-use hint.
2629 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2630 } else if (IsNonTemporal) {
2631 // Set non-temporal hint for all cache levels.
2632 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2633 }
2634
2635 if (IsVolatile) {
2636 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2637
2638 // Ensure operation has completed at system scope to cause all volatile
2639 // operations to be visible outside the program in a global order. Do not
2640 // request cross address space as only the global address space can be
2641 // observable outside the program, so no need to cause a waitcnt for LDS
2642 // address space operations.
2643 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2644 Position::AFTER, AtomicOrdering::Unordered);
2645 }
2646
2647 return Changed;
2648}
2649
2650bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2651 assert(MI.mayStore() && "Not a Store inst");
2652 const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2653 bool Changed = false;
2654
2655 // GFX12.5 only: an xcnt wait is needed before flat and global atomic
2656 // stores/rmw.
2657 if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
2658 MachineBasicBlock &MBB = *MI.getParent();
2659 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2660 Changed = true;
2661 }
2662
2663 // Remaining fixes do not apply to RMWs.
2664 if (IsRMW)
2665 return Changed;
2666
2667 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2668 if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2669 return Changed;
2670 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2671
2672 // GFX12.0 only: Extra waits needed before system scope stores.
2673 if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
2674 Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
2675
2676 return Changed;
2677}
2678
2679bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2680 if (!ST.hasGFX1250Insts())
2681 return false;
2682
2683 // Cooperative atomics need to be SCOPE_DEV or higher.
2684 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2685 assert(CPol && "No CPol operand?");
2686 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2687 if (Scope < CPol::SCOPE_DEV)
2688 return setScope(MI, CPol::SCOPE_DEV);
2689 return false;
2690}
2691
2692bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2693 SIAtomicScope Scope,
2694 SIAtomicAddrSpace AddrSpace) const {
2695 bool Changed = false;
2696
2697 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2698 switch (Scope) {
2699 case SIAtomicScope::SYSTEM:
2700 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2701 break;
2702 case SIAtomicScope::AGENT:
2703 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2704 break;
2705 case SIAtomicScope::CLUSTER:
2706 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2707 break;
2708 case SIAtomicScope::WORKGROUP:
2709 // In workgroup mode, SCOPE_SE is needed as waves can execute on
2710 // different CUs that access different L0s.
2711 if (!ST.isCuModeEnabled())
2712 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2713 break;
2714 case SIAtomicScope::WAVEFRONT:
2715 case SIAtomicScope::SINGLETHREAD:
2716 // No cache to bypass.
2717 break;
2718 default:
2719 llvm_unreachable("Unsupported synchronization scope");
2720 }
2721 }
2722
2723 // The scratch address space does not need the global memory caches
2724 // to be bypassed as all memory operations by the same thread are
2725 // sequentially consistent, and no other thread can access scratch
2726 // memory.
2727
2728 // Other address spaces do not have a cache.
2729
2730 return Changed;
2731}
2732
2733bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2734 if (AtomicPseudoMIs.empty())
2735 return false;
2736
2737 for (auto &MI : AtomicPseudoMIs)
2738 MI->eraseFromParent();
2739
2740 AtomicPseudoMIs.clear();
2741 return true;
2742}
2743
2744 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2745 MachineBasicBlock::iterator &MI) {
2746 assert(MI->mayLoad() && !MI->mayStore());
2747
2748 bool Changed = false;
2749
2750 if (MOI.isAtomic()) {
2751 const AtomicOrdering Order = MOI.getOrdering();
2752 if (Order == AtomicOrdering::Monotonic ||
2753 Order == AtomicOrdering::Acquire ||
2754 Order == AtomicOrdering::SequentiallyConsistent) {
2755 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2756 MOI.getOrderingAddrSpace());
2757 }
2758
2759 // Handle cooperative atomics after cache bypass step, as it may override
2760 // the scope of the instruction to a greater scope.
2761 if (MOI.isCooperative())
2762 Changed |= CC->handleCooperativeAtomic(*MI);
2763
2764 if (Order == AtomicOrdering::SequentiallyConsistent)
2765 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2766 SIMemOp::LOAD | SIMemOp::STORE,
2767 MOI.getIsCrossAddressSpaceOrdering(),
2768 Position::BEFORE, Order);
2769
2770 if (Order == AtomicOrdering::Acquire ||
2771 Order == AtomicOrdering::SequentiallyConsistent) {
2772 Changed |= CC->insertWait(
2773 MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
2774 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2775 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2776 MOI.getOrderingAddrSpace(),
2777 Position::AFTER);
2778 }
2779
2780 return Changed;
2781 }
2782
2783 // Atomic instructions already bypass caches to the scope specified by the
2784 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2785 // instructions need additional treatment.
2786 Changed |= CC->enableVolatileAndOrNonTemporal(
2787 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2788 MOI.isNonTemporal(), MOI.isLastUse());
2789
2790 return Changed;
2791}
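// Illustrative mapping (assuming a global atomic load in LLVM IR):
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
// enableLoadCacheBypass marks the load itself, then insertWait and
// insertAcquire add the wait and the cache invalidate after it; a seq_cst load
// additionally gets a wait inserted before it.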
2792
2793 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2794 MachineBasicBlock::iterator &MI) {
2795 assert(!MI->mayLoad() && MI->mayStore());
2796
2797 bool Changed = false;
2798 // FIXME: Necessary hack because iterator can lose track of the store.
2799 MachineInstr &StoreMI = *MI;
2800
2801 if (MOI.isAtomic()) {
2802 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2803 MOI.getOrdering() == AtomicOrdering::Release ||
2804 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2805 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2806 MOI.getOrderingAddrSpace());
2807 }
2808
2809 // Handle cooperative atomics after cache bypass step, as it may override
2810 // the scope of the instruction to a greater scope.
2811 if (MOI.isCooperative())
2812 Changed |= CC->handleCooperativeAtomic(*MI);
2813
2814 if (MOI.getOrdering() == AtomicOrdering::Release ||
2815 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2816 Changed |= CC->insertRelease(MI, MOI.getScope(),
2817 MOI.getOrderingAddrSpace(),
2818 MOI.getIsCrossAddressSpaceOrdering(),
2819 Position::BEFORE);
2820
2821 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
2822 return Changed;
2823 }
2824
2825 // Atomic instructions already bypass caches to the scope specified by the
2826 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2827 // need additional treatment.
2828 Changed |= CC->enableVolatileAndOrNonTemporal(
2829 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2830 MOI.isNonTemporal());
2831
2832 // GFX12 specific: the scope (the desired coherence domain in the cache
2833 // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2834 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
2835 return Changed;
2836}
2837
2838 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2839 MachineBasicBlock::iterator &MI) {
2840 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2841
2842 AtomicPseudoMIs.push_back(MI);
2843 bool Changed = false;
2844
2845 const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2846
2847 if (MOI.isAtomic()) {
2848 const AtomicOrdering Order = MOI.getOrdering();
2849 if (Order == AtomicOrdering::Acquire) {
2850 Changed |= CC->insertWait(
2851 MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2852 MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
2853 }
2854
2855 if (Order == AtomicOrdering::Release ||
2856 Order == AtomicOrdering::AcquireRelease ||
2857 Order == AtomicOrdering::SequentiallyConsistent)
2858 /// TODO: This relies on a barrier always generating a waitcnt
2859 /// for LDS to ensure it is not reordered with the completion of
2860 /// the preceding LDS operations. If the barrier had a memory
2861 /// ordering and memory scope, then the library would not need to
2862 /// generate a fence. Could add support in this file for
2863 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2864 /// adding S_WAITCNT before a S_BARRIER.
2865 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2866 MOI.getIsCrossAddressSpaceOrdering(),
2867 Position::BEFORE);
2868
2869 // TODO: If both release and invalidate are happening they could be combined
2870 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2871 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2872 // track cache invalidate and write back instructions.
2873
2874 if (Order == AtomicOrdering::Acquire ||
2875 Order == AtomicOrdering::AcquireRelease ||
2876 Order == AtomicOrdering::SequentiallyConsistent)
2877 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2878 Position::BEFORE);
2879
2880 return Changed;
2881 }
2882
2883 return Changed;
2884}
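// Illustrative mapping: "fence syncscope("agent") acq_rel" produces a release
// (which includes the needed waits and any writeback) and an acquire (cache
// invalidate), both inserted before the fence; the ATOMIC_FENCE pseudo itself
// is queued in AtomicPseudoMIs and erased later by removeAtomicPseudoMIs().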
2885
2886 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2887 MachineBasicBlock::iterator &MI) {
2888 assert(MI->mayLoad() && MI->mayStore());
2889
2890 bool Changed = false;
2891 MachineInstr &RMWMI = *MI;
2892
2893 if (MOI.isAtomic()) {
2894 const AtomicOrdering Order = MOI.getOrdering();
2895 if (Order == AtomicOrdering::Monotonic ||
2896 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2897 Order == AtomicOrdering::AcquireRelease ||
2898 Order == AtomicOrdering::SequentiallyConsistent) {
2899 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2900 MOI.getInstrAddrSpace());
2901 }
2902
2903 if (Order == AtomicOrdering::Release ||
2904 Order == AtomicOrdering::AcquireRelease ||
2905 Order == AtomicOrdering::SequentiallyConsistent ||
2906 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2907 Changed |= CC->insertRelease(MI, MOI.getScope(),
2908 MOI.getOrderingAddrSpace(),
2909 MOI.getIsCrossAddressSpaceOrdering(),
2910 Position::BEFORE);
2911
2912 if (Order == AtomicOrdering::Acquire ||
2913 Order == AtomicOrdering::AcquireRelease ||
2914 Order == AtomicOrdering::SequentiallyConsistent ||
2915 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2916 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2917 Changed |= CC->insertWait(
2918 MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2919 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2920 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2921 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2922 MOI.getOrderingAddrSpace(),
2923 Position::AFTER);
2924 }
2925
2926 Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
2927 return Changed;
2928 }
2929
2930 return Changed;
2931}
2932
2933bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2934 const MachineModuleInfo &MMI =
2935 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2936 return SIMemoryLegalizer(MMI).run(MF);
2937}
2938
2939 PreservedAnalyses
2940 SIMemoryLegalizerPass::run(MachineFunction &MF,
2941 MachineFunctionAnalysisManager &MFAM) {
2942 auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
2943 .getCachedResult<MachineModuleAnalysis>(
2944 *MF.getFunction().getParent());
2945 assert(MMI && "MachineModuleAnalysis must be available");
2946 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2947 return PreservedAnalyses::all();
2948 return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
2949 }
2950
2951bool SIMemoryLegalizer::run(MachineFunction &MF) {
2952 bool Changed = false;
2953
2954 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2955 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
2956 CC = SICacheControl::create(ST);
2957
2958 for (auto &MBB : MF) {
2959 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2960
2961 // Unbundle instructions after the post-RA scheduler.
2962 if (MI->isBundle() && MI->mayLoadOrStore()) {
2963 MachineBasicBlock::instr_iterator II(MI->getIterator());
2964 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2965 I != E && I->isBundledWithPred(); ++I) {
2966 I->unbundleFromPred();
2967 for (MachineOperand &MO : I->operands())
2968 if (MO.isReg())
2969 MO.setIsInternalRead(false);
2970 }
2971
2972 MI->eraseFromParent();
2973 MI = II->getIterator();
2974 }
2975
2976 if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
2977 Changed |= CC->insertBarrierStart(MI);
2978 continue;
2979 }
2980
2981 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2982 continue;
2983
2984 if (const auto &MOI = MOA.getLoadInfo(MI))
2985 Changed |= expandLoad(*MOI, MI);
2986 else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2987 Changed |= expandStore(*MOI, MI);
2988 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2989 Changed |= expandAtomicFence(*MOI, MI);
2990 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2991 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2992 }
2993 }
2994
2995 Changed |= removeAtomicPseudoMIs();
2996 return Changed;
2997}
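// The pass can be exercised in isolation on MIR, for example with something
// like:
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a \
//       -run-pass=si-memory-legalizer input.mir -o -
// (an illustrative invocation; the pass argument comes from DEBUG_TYPE,
// "si-memory-legalizer").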
2998
2999INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
3000
3001char SIMemoryLegalizerLegacy::ID = 0;
3002char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
3003
3004 FunctionPass *llvm::createSIMemoryLegalizerPass() {
3005 return new SIMemoryLegalizerLegacy();
3006}