SIMemoryLegalizer.cpp
1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
18#include "GCNSubtarget.h"
27#include "llvm/IR/PassManager.h"
30
31using namespace llvm;
32using namespace llvm::AMDGPU;
33
34#define DEBUG_TYPE "si-memory-legalizer"
35#define PASS_NAME "SI Memory Legalizer"
36
37static cl::opt<bool> AmdgcnSkipCacheInvalidations(
38 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
39 cl::desc("Use this to skip inserting cache invalidating instructions."));
40
41namespace {
42
43LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
44
45/// Memory operation flags. Can be ORed together.
46enum class SIMemOp {
47 NONE = 0u,
48 LOAD = 1u << 0,
49 STORE = 1u << 1,
50 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
51};
52
53/// Position to insert a new instruction relative to an existing
54/// instruction.
55enum class Position {
56 BEFORE,
57 AFTER
58};
59
60/// The atomic synchronization scopes supported by the AMDGPU target.
61enum class SIAtomicScope {
62 NONE,
63 SINGLETHREAD,
64 WAVEFRONT,
65 WORKGROUP,
66 CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
67 AGENT,
68 SYSTEM
69};
70
71/// The distinct address spaces supported by the AMDGPU target for
72/// atomic memory operations. Can be ORed together.
73enum class SIAtomicAddrSpace {
74 NONE = 0u,
75 GLOBAL = 1u << 0,
76 LDS = 1u << 1,
77 SCRATCH = 1u << 2,
78 GDS = 1u << 3,
79 OTHER = 1u << 4,
80
81 /// The address spaces that can be accessed by a FLAT instruction.
82 FLAT = GLOBAL | LDS | SCRATCH,
83
84 /// The address spaces that support atomic instructions.
85 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
86
87 /// All address spaces.
88 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
89
90 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
91};
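// The bitmask operators enabled by LLVM_MARK_AS_BITMASK_ENUM above are used
// throughout this file; for example, membership is tested with
//   (AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE
// and combined sets are formed with |, as in FLAT = GLOBAL | LDS | SCRATCH.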
92
93class SIMemOpInfo final {
94private:
95
96 friend class SIMemOpAccess;
97
98 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
99 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
100 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
101 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
102 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
103 bool IsCrossAddressSpaceOrdering = false;
104 bool IsVolatile = false;
105 bool IsNonTemporal = false;
106 bool IsLastUse = false;
107 bool IsCooperative = false;
108
109 // TODO: Should we assume Cooperative=true if no MMO is present?
110 SIMemOpInfo(
111 const GCNSubtarget &ST,
112 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
113 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
114 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
115 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
116 bool IsCrossAddressSpaceOrdering = true,
117 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
118 bool IsVolatile = false, bool IsNonTemporal = false,
119 bool IsLastUse = false, bool IsCooperative = false)
120 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
121 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
122 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
123 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
124 IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
125
126 if (Ordering == AtomicOrdering::NotAtomic) {
127 assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
128 assert(Scope == SIAtomicScope::NONE &&
129 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
130 !IsCrossAddressSpaceOrdering &&
131 FailureOrdering == AtomicOrdering::NotAtomic);
132 return;
133 }
134
135 assert(Scope != SIAtomicScope::NONE &&
136 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
137 SIAtomicAddrSpace::NONE &&
138 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
139 SIAtomicAddrSpace::NONE);
140
141 // There is also no cross address space ordering if the ordering
142 // address space is the same as the instruction address space and
143 // only contains a single address space.
144 if ((OrderingAddrSpace == InstrAddrSpace) &&
145 isPowerOf2_32(uint32_t(InstrAddrSpace)))
146 this->IsCrossAddressSpaceOrdering = false;
147
148 // Limit the scope to the maximum supported by the instruction's address
149 // spaces.
150 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
151 SIAtomicAddrSpace::NONE) {
152 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
153 } else if ((InstrAddrSpace &
154 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
155 SIAtomicAddrSpace::NONE) {
156 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
157 } else if ((InstrAddrSpace &
158 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
159 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
160 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
161 }
162
163 // On targets that have no concept of a workgroup cluster, use
164 // AGENT scope as a conservatively correct alternative.
165 if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
166 this->Scope = SIAtomicScope::AGENT;
167 }
168
169public:
170 /// \returns Atomic synchronization scope of the machine instruction used to
171 /// create this SIMemOpInfo.
172 SIAtomicScope getScope() const {
173 return Scope;
174 }
175
176 /// \returns Ordering constraint of the machine instruction used to
177 /// create this SIMemOpInfo.
178 AtomicOrdering getOrdering() const {
179 return Ordering;
180 }
181
182 /// \returns Failure ordering constraint of the machine instruction used to
183 /// create this SIMemOpInfo.
184 AtomicOrdering getFailureOrdering() const {
185 return FailureOrdering;
186 }
187
188 /// \returns The address spaces accessed by the machine
189 /// instruction used to create this SIMemOpInfo.
190 SIAtomicAddrSpace getInstrAddrSpace() const {
191 return InstrAddrSpace;
192 }
193
194 /// \returns The address spaces that must be ordered by the machine
195 /// instruction used to create this SIMemOpInfo.
196 SIAtomicAddrSpace getOrderingAddrSpace() const {
197 return OrderingAddrSpace;
198 }
199
200 /// \returns True iff memory ordering of operations on
201 /// different address spaces is required.
202 bool getIsCrossAddressSpaceOrdering() const {
203 return IsCrossAddressSpaceOrdering;
204 }
205
206 /// \returns True if memory access of the machine instruction used to
207 /// create this SIMemOpInfo is volatile, false otherwise.
208 bool isVolatile() const {
209 return IsVolatile;
210 }
211
212 /// \returns True if memory access of the machine instruction used to
213 /// create this SIMemOpInfo is nontemporal, false otherwise.
214 bool isNonTemporal() const {
215 return IsNonTemporal;
216 }
217
218 /// \returns True if memory access of the machine instruction used to
219 /// create this SIMemOpInfo is last use, false otherwise.
220 bool isLastUse() const { return IsLastUse; }
221
222 /// \returns True if this is a cooperative load or store atomic.
223 bool isCooperative() const { return IsCooperative; }
224
225 /// \returns True if ordering constraint of the machine instruction used to
226 /// create this SIMemOpInfo is unordered or higher, false otherwise.
227 bool isAtomic() const {
228 return Ordering != AtomicOrdering::NotAtomic;
229 }
230
231};
232
233class SIMemOpAccess final {
234private:
235 const AMDGPUMachineModuleInfo *MMI = nullptr;
236 const GCNSubtarget &ST;
237
238 /// Reports unsupported message \p Msg for \p MI to LLVM context.
239 void reportUnsupported(const MachineBasicBlock::iterator &MI,
240 const char *Msg) const;
241
242 /// Inspects the target synchronization scope \p SSID and determines
243 /// the SI atomic scope it corresponds to, the address spaces it
244 /// covers, and whether the memory ordering applies between address
245 /// spaces.
246 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
247 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
248
249 /// \returns The SI atomic address spaces accessed via target address space \p AS.
250 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
251
252 /// \returns Info constructed from \p MI, which has at least one machine memory
253 /// operand.
254 std::optional<SIMemOpInfo>
255 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
256
257public:
258 /// Construct class to support accessing the machine memory operands
259 /// of instructions in the machine function \p MF.
260 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
261
262 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
263 std::optional<SIMemOpInfo>
264 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
265
266 /// \returns Store info if \p MI is a store operation, "std::nullopt"
267 /// otherwise.
268 std::optional<SIMemOpInfo>
269 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
270
271 /// \returns Atomic fence info if \p MI is an atomic fence operation,
272 /// "std::nullopt" otherwise.
273 std::optional<SIMemOpInfo>
274 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
275
276 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
277 /// rmw operation, "std::nullopt" otherwise.
278 std::optional<SIMemOpInfo>
279 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
280};
281
282class SICacheControl {
283protected:
284
285 /// AMDGPU subtarget info.
286 const GCNSubtarget &ST;
287
288 /// Instruction info.
289 const SIInstrInfo *TII = nullptr;
290
291 IsaVersion IV;
292
293 /// Whether to insert cache invalidating instructions.
294 bool InsertCacheInv;
295
296 SICacheControl(const GCNSubtarget &ST);
297
298 /// Sets named bit \p Bit to "true" if present in instruction \p MI.
299 /// \returns True if \p MI is modified, false otherwise.
300 bool enableNamedBit(const MachineBasicBlock::iterator MI,
301 AMDGPU::CPol::CPol Bit) const;
302
303 /// Check if any atomic operation on AS can affect memory accessible via the
304 /// global address space.
305 bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
306
307public:
308
309 /// Create a cache control for the subtarget \p ST.
310 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
311
312 /// Update \p MI memory load instruction to bypass any caches up to
313 /// the \p Scope memory scope for address spaces \p
314 /// AddrSpace. Return true iff the instruction was modified.
315 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
316 SIAtomicScope Scope,
317 SIAtomicAddrSpace AddrSpace) const = 0;
318
319 /// Update \p MI memory store instruction to bypass any caches up to
320 /// the \p Scope memory scope for address spaces \p
321 /// AddrSpace. Return true iff the instruction was modified.
322 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
323 SIAtomicScope Scope,
324 SIAtomicAddrSpace AddrSpace) const = 0;
325
326 /// Update \p MI memory read-modify-write instruction to bypass any caches up
327 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
328 /// iff the instruction was modified.
329 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
330 SIAtomicScope Scope,
331 SIAtomicAddrSpace AddrSpace) const = 0;
332
333 /// Update \p MI memory instruction of kind \p Op associated with address
334 /// spaces \p AddrSpace to indicate it is volatile and/or
335 /// nontemporal/last-use. Return true iff the instruction was modified.
336 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
337 SIAtomicAddrSpace AddrSpace,
338 SIMemOp Op, bool IsVolatile,
339 bool IsNonTemporal,
340 bool IsLastUse = false) const = 0;
341
342 /// Add final touches to a `mayStore` instruction \p MI, which may be a
343 /// Store or RMW instruction.
344 /// FIXME: This takes an MI because iterators aren't handled properly. When
345 /// this is called, they often point to entirely different insts. Thus we back
346 /// up the inst early and pass it here instead.
347 virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
348 return false;
349 };
350
351 /// Handle cooperative load/store atomics.
352 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
353 llvm_unreachable(
354 "cooperative atomics are not available on this architecture");
355 }
356
357 /// Inserts any necessary instructions at position \p Pos relative
358 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
359 /// \p Op associated with address spaces \p AddrSpace have completed. Used
360 /// between memory instructions to enforce the order they become visible as
361 /// observed by other memory instructions executing in memory scope \p Scope.
362 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
363 /// address spaces. Returns true iff any instructions were inserted.
364 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
365 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
366 bool IsCrossAddrSpaceOrdering, Position Pos,
367 AtomicOrdering Order) const = 0;
368
369 /// Inserts any necessary instructions at position \p Pos relative to
370 /// instruction \p MI to ensure any subsequent memory instructions of this
371 /// thread with address spaces \p AddrSpace will observe the previous memory
372 /// operations by any thread for memory scopes up to memory scope \p Scope.
373 /// Returns true iff any instructions were inserted.
374 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
375 SIAtomicScope Scope,
376 SIAtomicAddrSpace AddrSpace,
377 Position Pos) const = 0;
378
379 /// Inserts any necessary instructions at position \p Pos relative to
380 /// instruction \p MI to ensure previous memory instructions by this thread
381 /// with address spaces \p AddrSpace have completed and can be observed by
382 /// subsequent memory instructions by any thread executing in memory scope \p
383 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
384 /// between address spaces. Returns true iff any instructions were inserted.
385 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
386 SIAtomicScope Scope,
387 SIAtomicAddrSpace AddrSpace,
388 bool IsCrossAddrSpaceOrdering,
389 Position Pos) const = 0;
390
391 /// Inserts any necessary instructions before the barrier start instruction
392 /// \p MI in order to support pairing of barriers and fences.
393 virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
394 return false;
395 };
396
397 /// Virtual destructor to allow derivations to be deleted.
398 virtual ~SICacheControl() = default;
399};
400
401class SIGfx6CacheControl : public SICacheControl {
402protected:
403
404 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
405 /// is modified, false otherwise.
406 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
407 return enableNamedBit(MI, AMDGPU::CPol::GLC);
408 }
409
410 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
411 /// is modified, false otherwise.
412 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
413 return enableNamedBit(MI, AMDGPU::CPol::SLC);
414 }
415
416public:
417
418 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
419
420 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
421 SIAtomicScope Scope,
422 SIAtomicAddrSpace AddrSpace) const override;
423
424 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
425 SIAtomicScope Scope,
426 SIAtomicAddrSpace AddrSpace) const override;
427
428 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
429 SIAtomicScope Scope,
430 SIAtomicAddrSpace AddrSpace) const override;
431
432 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
433 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
434 bool IsVolatile, bool IsNonTemporal,
435 bool IsLastUse) const override;
436
437 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
438 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
439 bool IsCrossAddrSpaceOrdering, Position Pos,
440 AtomicOrdering Order) const override;
441
442 bool insertAcquire(MachineBasicBlock::iterator &MI,
443 SIAtomicScope Scope,
444 SIAtomicAddrSpace AddrSpace,
445 Position Pos) const override;
446
447 bool insertRelease(MachineBasicBlock::iterator &MI,
448 SIAtomicScope Scope,
449 SIAtomicAddrSpace AddrSpace,
450 bool IsCrossAddrSpaceOrdering,
451 Position Pos) const override;
452};
453
454class SIGfx7CacheControl : public SIGfx6CacheControl {
455public:
456
457 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
458
459 bool insertAcquire(MachineBasicBlock::iterator &MI,
460 SIAtomicScope Scope,
461 SIAtomicAddrSpace AddrSpace,
462 Position Pos) const override;
463
464};
465
466class SIGfx90ACacheControl : public SIGfx7CacheControl {
467public:
468
469 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
470
471 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
472 SIAtomicScope Scope,
473 SIAtomicAddrSpace AddrSpace) const override;
474
475 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
476 SIAtomicScope Scope,
477 SIAtomicAddrSpace AddrSpace) const override;
478
479 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
480 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
481 bool IsVolatile, bool IsNonTemporal,
482 bool IsLastUse) const override;
483
484 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
485 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
486 bool IsCrossAddrSpaceOrdering, Position Pos,
487 AtomicOrdering Order) const override;
488
489 bool insertAcquire(MachineBasicBlock::iterator &MI,
490 SIAtomicScope Scope,
491 SIAtomicAddrSpace AddrSpace,
492 Position Pos) const override;
493
494 bool insertRelease(MachineBasicBlock::iterator &MI,
495 SIAtomicScope Scope,
496 SIAtomicAddrSpace AddrSpace,
497 bool IsCrossAddrSpaceOrdering,
498 Position Pos) const override;
499};
500
501class SIGfx940CacheControl : public SIGfx90ACacheControl {
502protected:
503
504 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
505 /// is modified, false otherwise.
506 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
507 return enableNamedBit(MI, AMDGPU::CPol::SC0);
508 }
509
510 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
511 /// is modified, false otherwise.
512 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
513 return enableNamedBit(MI, AMDGPU::CPol::SC1);
514 }
515
516 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
517 /// is modified, false otherwise.
518 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
519 return enableNamedBit(MI, AMDGPU::CPol::NT);
520 }
521
522public:
523 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
524
525 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
526 SIAtomicScope Scope,
527 SIAtomicAddrSpace AddrSpace) const override;
528
529 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
530 SIAtomicScope Scope,
531 SIAtomicAddrSpace AddrSpace) const override;
532
533 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
534 SIAtomicScope Scope,
535 SIAtomicAddrSpace AddrSpace) const override;
536
537 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
538 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
539 bool IsVolatile, bool IsNonTemporal,
540 bool IsLastUse) const override;
541
542 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
543 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
544
545 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
546 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
547 Position Pos) const override;
548};
549
550class SIGfx10CacheControl : public SIGfx7CacheControl {
551protected:
552
553 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
554 /// is modified, false otherwise.
555 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
556 return enableNamedBit(MI, AMDGPU::CPol::DLC);
557 }
558
559public:
560
561 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
562
563 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
564 SIAtomicScope Scope,
565 SIAtomicAddrSpace AddrSpace) const override;
566
567 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
568 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
569 bool IsVolatile, bool IsNonTemporal,
570 bool IsLastUse) const override;
571
572 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
573 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
574 bool IsCrossAddrSpaceOrdering, Position Pos,
575 AtomicOrdering Order) const override;
576
577 bool insertAcquire(MachineBasicBlock::iterator &MI,
578 SIAtomicScope Scope,
579 SIAtomicAddrSpace AddrSpace,
580 Position Pos) const override;
581
582 bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
583};
584
585class SIGfx11CacheControl : public SIGfx10CacheControl {
586public:
587 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
588
589 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
590 SIAtomicScope Scope,
591 SIAtomicAddrSpace AddrSpace) const override;
592
593 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
594 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
595 bool IsVolatile, bool IsNonTemporal,
596 bool IsLastUse) const override;
597};
598
599class SIGfx12CacheControl : public SIGfx11CacheControl {
600protected:
601 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
602 // \returns True if \p MI is modified, false otherwise.
603 bool setTH(const MachineBasicBlock::iterator MI,
604 AMDGPU::CPol::CPol Value) const;
605 // Sets Scope policy to \p Value if CPol operand is present in instruction
606 // \p MI. \returns True if \p MI is modified, false otherwise.
607 bool setScope(const MachineBasicBlock::iterator MI,
608 AMDGPU::CPol::CPol Value) const;
609
610 // Stores with system scope (SCOPE_SYS) need to wait for:
611 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
612 // - non-returning-atomics - wait for STORECNT==0
613 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
614 // since it does not distinguish atomics-with-return from regular stores.
615 // There is no need to wait if memory is cached (mtype != UC).
616 bool
617 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
618
619 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
620 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
621
622public:
623 SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
624 // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
625 // the behavior is the same if assuming GFX12.0 in CU mode.
626 assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
627 }
628
629 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
630 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
631 bool IsCrossAddrSpaceOrdering, Position Pos,
632 AtomicOrdering Order) const override;
633
634 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
635 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
636
637 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
638 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
639 bool IsVolatile, bool IsNonTemporal,
640 bool IsLastUse) const override;
641
642 bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
643
644 virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
645
646 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
647 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
648 Position Pos) const override;
649
650 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
651 SIAtomicScope Scope,
652 SIAtomicAddrSpace AddrSpace) const override {
653 return setAtomicScope(MI, Scope, AddrSpace);
654 }
655
656 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
657 SIAtomicScope Scope,
658 SIAtomicAddrSpace AddrSpace) const override {
659 return setAtomicScope(MI, Scope, AddrSpace);
660 }
661
662 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
663 SIAtomicScope Scope,
664 SIAtomicAddrSpace AddrSpace) const override {
665 return setAtomicScope(MI, Scope, AddrSpace);
666 }
667};
668
669class SIMemoryLegalizer final {
670private:
671 const MachineModuleInfo &MMI;
672 /// Cache Control.
673 std::unique_ptr<SICacheControl> CC = nullptr;
674
675 /// List of atomic pseudo instructions.
676 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
677
678 /// Return true iff instruction \p MI is an atomic instruction that
679 /// returns a result.
680 bool isAtomicRet(const MachineInstr &MI) const {
681 return SIInstrInfo::isAtomicRet(MI);
682 }
683
684 /// Removes all processed atomic pseudo instructions from the current
685 /// function. Returns true if current function is modified, false otherwise.
686 bool removeAtomicPseudoMIs();
687
688 /// Expands load operation \p MI. Returns true if instructions are
689 /// added/deleted or \p MI is modified, false otherwise.
690 bool expandLoad(const SIMemOpInfo &MOI,
691 MachineBasicBlock::iterator &MI);
692 /// Expands store operation \p MI. Returns true if instructions are
693 /// added/deleted or \p MI is modified, false otherwise.
694 bool expandStore(const SIMemOpInfo &MOI,
695 MachineBasicBlock::iterator &MI);
696 /// Expands atomic fence operation \p MI. Returns true if
697 /// instructions are added/deleted or \p MI is modified, false otherwise.
698 bool expandAtomicFence(const SIMemOpInfo &MOI,
699 MachineBasicBlock::iterator &MI);
700 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
701 /// instructions are added/deleted or \p MI is modified, false otherwise.
702 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
703 MachineBasicBlock::iterator &MI);
704
705public:
706 SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
707 bool run(MachineFunction &MF);
708};
709
710class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
711public:
712 static char ID;
713
714 SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
715
716 void getAnalysisUsage(AnalysisUsage &AU) const override {
717 AU.setPreservesCFG();
718 MachineFunctionPass::getAnalysisUsage(AU);
719 }
720
721 StringRef getPassName() const override {
722 return PASS_NAME;
723 }
724
725 bool runOnMachineFunction(MachineFunction &MF) override;
726};
727
728static const StringMap<SIAtomicAddrSpace> ASNames = {{
729 {"global", SIAtomicAddrSpace::GLOBAL},
730 {"local", SIAtomicAddrSpace::LDS},
731}};
732
733void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
734 const MachineFunction *MF = MI.getMF();
735 const Function &Fn = MF->getFunction();
736 SmallString<128> Str;
737 raw_svector_ostream OS(Str);
738 OS << "unknown address space '" << AS << "'; expected one of ";
739 ListSeparator LS;
740 for (const auto &[Name, Val] : ASNames)
741 OS << LS << '\'' << Name << '\'';
742 Fn.getContext().diagnose(
743 DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
744}
745
746/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
747/// If this tag isn't present, or if it has no meaningful values, returns
748 /// std::nullopt; otherwise returns the address spaces specified by the metadata.
749static std::optional<SIAtomicAddrSpace>
750getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
751 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
752
753 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
754 if (!MMRA)
755 return std::nullopt;
756
757 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
758 for (const auto &[Prefix, Suffix] : MMRA) {
759 if (Prefix != FenceASPrefix)
760 continue;
761
762 if (auto It = ASNames.find(Suffix); It != ASNames.end())
763 Result |= It->second;
764 else
765 diagnoseUnknownMMRAASName(MI, Suffix);
766 }
767
768 if (Result == SIAtomicAddrSpace::NONE)
769 return std::nullopt;
770
771 return Result;
772}
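// Illustrative sketch (assumed IR spelling, not taken from this file): the
// MMRA parsed above originates from IR such as
//   fence syncscope("agent") release, !mmra !0
//   !0 = !{!"amdgpu-synchronize-as", !"local"}
// which this helper maps to SIAtomicAddrSpace::LDS; suffixes not listed in
// ASNames are reported via diagnoseUnknownMMRAASName.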
773
774} // end anonymous namespace
775
776void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
777 const char *Msg) const {
778 const Function &Func = MI->getParent()->getParent()->getFunction();
779 Func.getContext().diagnose(
780 DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
781}
782
783std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
784SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
785 SIAtomicAddrSpace InstrAddrSpace) const {
786 if (SSID == SyncScope::System)
787 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
788 if (SSID == MMI->getAgentSSID())
789 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
790 if (SSID == MMI->getClusterSSID())
791 return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
792 if (SSID == MMI->getWorkgroupSSID())
793 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
794 true);
795 if (SSID == MMI->getWavefrontSSID())
796 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
797 true);
798 if (SSID == SyncScope::SingleThread)
799 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
800 true);
801 if (SSID == MMI->getSystemOneAddressSpaceSSID())
802 return std::tuple(SIAtomicScope::SYSTEM,
803 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
804 if (SSID == MMI->getAgentOneAddressSpaceSSID())
805 return std::tuple(SIAtomicScope::AGENT,
806 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
807 if (SSID == MMI->getClusterOneAddressSpaceSSID())
808 return std::tuple(SIAtomicScope::CLUSTER,
809 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
810 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
811 return std::tuple(SIAtomicScope::WORKGROUP,
812 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
813 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
814 return std::tuple(SIAtomicScope::WAVEFRONT,
815 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
816 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
817 return std::tuple(SIAtomicScope::SINGLETHREAD,
818 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
819 return std::nullopt;
820}
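// Note: the "one address space" scopes return false for cross address space
// ordering and restrict the ordered set to the address spaces the instruction
// actually accesses, while the plain scopes order all atomic address spaces
// and return true.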
821
822SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
823 if (AS == AMDGPUAS::FLAT_ADDRESS)
824 return SIAtomicAddrSpace::FLAT;
825 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
826 return SIAtomicAddrSpace::GLOBAL;
827 if (AS == AMDGPUAS::LOCAL_ADDRESS)
828 return SIAtomicAddrSpace::LDS;
829 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
830 return SIAtomicAddrSpace::SCRATCH;
831 if (AS == AMDGPUAS::REGION_ADDRESS)
832 return SIAtomicAddrSpace::GDS;
833
834 return SIAtomicAddrSpace::OTHER;
835}
836
837SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
838 const GCNSubtarget &ST)
839 : MMI(&MMI_), ST(ST) {}
840
841std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
842 const MachineBasicBlock::iterator &MI) const {
843 assert(MI->getNumMemOperands() > 0);
844
845 SyncScope::ID SSID = SyncScope::SingleThread;
846 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
847 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
848 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
849 bool IsNonTemporal = true;
850 bool IsVolatile = false;
851 bool IsLastUse = false;
852 bool IsCooperative = false;
853
854 // Validator should check whether or not MMOs cover the entire set of
855 // locations accessed by the memory instruction.
856 for (const auto &MMO : MI->memoperands()) {
857 IsNonTemporal &= MMO->isNonTemporal();
858 IsVolatile |= MMO->isVolatile();
859 IsLastUse |= MMO->getFlags() & MOLastUse;
860 IsCooperative |= MMO->getFlags() & MOCooperative;
861 InstrAddrSpace |=
862 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
863 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
864 if (OpOrdering != AtomicOrdering::NotAtomic) {
865 const auto &IsSyncScopeInclusion =
866 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
867 if (!IsSyncScopeInclusion) {
868 reportUnsupported(MI,
869 "Unsupported non-inclusive atomic synchronization scope");
870 return std::nullopt;
871 }
872
873 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
874 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
875 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
876 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
877 FailureOrdering =
878 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
879 }
880 }
881
882 SIAtomicScope Scope = SIAtomicScope::NONE;
883 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
884 bool IsCrossAddressSpaceOrdering = false;
885 if (Ordering != AtomicOrdering::NotAtomic) {
886 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
887 if (!ScopeOrNone) {
888 reportUnsupported(MI, "Unsupported atomic synchronization scope");
889 return std::nullopt;
890 }
891 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
892 *ScopeOrNone;
893 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
894 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
895 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
896 reportUnsupported(MI, "Unsupported atomic address space");
897 return std::nullopt;
898 }
899 }
900 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
901 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
902 IsNonTemporal, IsLastUse, IsCooperative);
903}
904
905std::optional<SIMemOpInfo>
906SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
907 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
908
909 if (!(MI->mayLoad() && !MI->mayStore()))
910 return std::nullopt;
911
912 // Be conservative if there are no memory operands.
913 if (MI->getNumMemOperands() == 0)
914 return SIMemOpInfo(ST);
915
916 return constructFromMIWithMMO(MI);
917}
918
919std::optional<SIMemOpInfo>
920SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
921 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
922
923 if (!(!MI->mayLoad() && MI->mayStore()))
924 return std::nullopt;
925
926 // Be conservative if there are no memory operands.
927 if (MI->getNumMemOperands() == 0)
928 return SIMemOpInfo(ST);
929
930 return constructFromMIWithMMO(MI);
931}
932
933std::optional<SIMemOpInfo>
934SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
935 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
936
937 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
938 return std::nullopt;
939
940 AtomicOrdering Ordering =
941 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
942
943 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
944 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
945 if (!ScopeOrNone) {
946 reportUnsupported(MI, "Unsupported atomic synchronization scope");
947 return std::nullopt;
948 }
949
950 SIAtomicScope Scope = SIAtomicScope::NONE;
951 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
952 bool IsCrossAddressSpaceOrdering = false;
953 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
954 *ScopeOrNone;
955
956 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
957 // We currently expect refineOrderingAS to be the only place that
958 // can refine the AS ordered by the fence.
959 // If that changes, we need to review the semantics of that function
960 // in case it needs to preserve certain address spaces.
961 reportUnsupported(MI, "Unsupported atomic address space");
962 return std::nullopt;
963 }
964
965 auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
966 if (SynchronizeAS)
967 OrderingAddrSpace = *SynchronizeAS;
968
969 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
970 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
971 AtomicOrdering::NotAtomic);
972}
973
974std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
975 const MachineBasicBlock::iterator &MI) const {
976 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
977
978 if (!(MI->mayLoad() && MI->mayStore()))
979 return std::nullopt;
980
981 // Be conservative if there are no memory operands.
982 if (MI->getNumMemOperands() == 0)
983 return SIMemOpInfo(ST);
984
985 return constructFromMIWithMMO(MI);
986}
987
988SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
989 TII = ST.getInstrInfo();
990 IV = getIsaVersion(ST.getCPU());
991 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
992}
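// A sketch of how InsertCacheInv is exercised (assumed llc invocation, for
// illustration only): the cache invalidations inserted by insertAcquire() can
// be suppressed via the hidden flag declared at the top of this file, e.g.
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a \
//       -amdgcn-skip-cache-invalidations input.ll -o -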
993
994bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
995 AMDGPU::CPol::CPol Bit) const {
996 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
997 if (!CPol)
998 return false;
999
1000 CPol->setImm(CPol->getImm() | Bit);
1001 return true;
1002}
1003
1004bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
1005 assert((!ST.hasGloballyAddressableScratch() ||
1006 (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE ||
1007 (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) &&
1008 "scratch instructions should already be replaced by flat "
1009 "instructions if GloballyAddressableScratch is enabled");
1010 return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
1011}
1012
1013/* static */
1014std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
1015 GCNSubtarget::Generation Generation = ST.getGeneration();
1016 if (ST.hasGFX940Insts())
1017 return std::make_unique<SIGfx940CacheControl>(ST);
1018 if (ST.hasGFX90AInsts())
1019 return std::make_unique<SIGfx90ACacheControl>(ST);
1020 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1021 return std::make_unique<SIGfx6CacheControl>(ST);
1022 if (Generation < AMDGPUSubtarget::GFX10)
1023 return std::make_unique<SIGfx7CacheControl>(ST);
1024 if (Generation < AMDGPUSubtarget::GFX11)
1025 return std::make_unique<SIGfx10CacheControl>(ST);
1026 if (Generation < AMDGPUSubtarget::GFX12)
1027 return std::make_unique<SIGfx11CacheControl>(ST);
1028 return std::make_unique<SIGfx12CacheControl>(ST);
1029}
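// The feature checks above deliberately precede the generation checks: gfx90a
// and gfx940, although part of the GFX9 generation, select
// SIGfx90ACacheControl / SIGfx940CacheControl rather than SIGfx7CacheControl.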
1030
1031bool SIGfx6CacheControl::enableLoadCacheBypass(
1032 const MachineBasicBlock::iterator &MI,
1033 SIAtomicScope Scope,
1034 SIAtomicAddrSpace AddrSpace) const {
1035 assert(MI->mayLoad() && !MI->mayStore());
1036 bool Changed = false;
1037
1038 if (canAffectGlobalAddrSpace(AddrSpace)) {
1039 switch (Scope) {
1040 case SIAtomicScope::SYSTEM:
1041 case SIAtomicScope::AGENT:
1042 // Set L1 cache policy to MISS_EVICT.
1043 // Note: there is no L2 cache bypass policy at the ISA level.
1044 Changed |= enableGLCBit(MI);
1045 break;
1046 case SIAtomicScope::WORKGROUP:
1047 case SIAtomicScope::WAVEFRONT:
1048 case SIAtomicScope::SINGLETHREAD:
1049 // No cache to bypass.
1050 break;
1051 default:
1052 llvm_unreachable("Unsupported synchronization scope");
1053 }
1054 }
1055
1056 /// The scratch address space does not need the global memory caches
1057 /// to be bypassed as all memory operations by the same thread are
1058 /// sequentially consistent, and no other thread can access scratch
1059 /// memory.
1060
1061 /// Other address spaces do not have a cache.
1062
1063 return Changed;
1064}
1065
1066bool SIGfx6CacheControl::enableStoreCacheBypass(
1067 const MachineBasicBlock::iterator &MI,
1068 SIAtomicScope Scope,
1069 SIAtomicAddrSpace AddrSpace) const {
1070 assert(!MI->mayLoad() && MI->mayStore());
1071 bool Changed = false;
1072
1073 /// The L1 cache is write-through, so it does not need to be bypassed. There
1074 /// is no bypass control for the L2 cache at the ISA level.
1075
1076 return Changed;
1077}
1078
1079bool SIGfx6CacheControl::enableRMWCacheBypass(
1080 const MachineBasicBlock::iterator &MI,
1081 SIAtomicScope Scope,
1082 SIAtomicAddrSpace AddrSpace) const {
1083 assert(MI->mayLoad() && MI->mayStore());
1084 bool Changed = false;
1085
1086 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
1087 /// bypassed, and the GLC bit is instead used to indicate if they are
1088 /// return or no-return.
1089 /// Note: there is no L2 cache coherent bypass control at the ISA level.
1090
1091 return Changed;
1092}
1093
1094bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1095 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1096 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1097 // Only handle load and store, not atomic read-modify-write instructions. The
1098 // latter use glc to indicate if the atomic returns a result and so must not
1099 // be used for cache control.
1100 assert(MI->mayLoad() ^ MI->mayStore());
1101
1102 // Only update load and store, not LLVM IR atomic read-modify-write
1103 // instructions. The latter are always marked as volatile, so they cannot be
1104 // handled here without pessimizing all atomics. Also, they do not support
1105 // the nontemporal attribute.
1106 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1107
1108 bool Changed = false;
1109
1110 if (IsVolatile) {
1111 // Set L1 cache policy to be MISS_EVICT for load instructions
1112 // and MISS_LRU for store instructions.
1113 // Note: there is no L2 cache bypass policy at the ISA level.
1114 if (Op == SIMemOp::LOAD)
1115 Changed |= enableGLCBit(MI);
1116
1117 // Ensure operation has completed at system scope to cause all volatile
1118 // operations to be visible outside the program in a global order. Do not
1119 // request cross address space as only the global address space can be
1120 // observable outside the program, so no need to cause a waitcnt for LDS
1121 // address space operations.
1122 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1123 Position::AFTER, AtomicOrdering::Unordered);
1124
1125 return Changed;
1126 }
1127
1128 if (IsNonTemporal) {
1129 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1130 // for both loads and stores, and the L2 cache policy to STREAM.
1131 Changed |= enableGLCBit(MI);
1132 Changed |= enableSLCBit(MI);
1133 return Changed;
1134 }
1135
1136 return Changed;
1137}
1138
1139bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1140 SIAtomicScope Scope,
1141 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1142 bool IsCrossAddrSpaceOrdering, Position Pos,
1143 AtomicOrdering Order) const {
1144 bool Changed = false;
1145
1146 MachineBasicBlock &MBB = *MI->getParent();
1147 DebugLoc DL = MI->getDebugLoc();
1148
1149 if (Pos == Position::AFTER)
1150 ++MI;
1151
1152 bool VMCnt = false;
1153 bool LGKMCnt = false;
1154
1155 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1156 SIAtomicAddrSpace::NONE) {
1157 switch (Scope) {
1158 case SIAtomicScope::SYSTEM:
1159 case SIAtomicScope::AGENT:
1160 VMCnt |= true;
1161 break;
1162 case SIAtomicScope::WORKGROUP:
1163 case SIAtomicScope::WAVEFRONT:
1164 case SIAtomicScope::SINGLETHREAD:
1165 // The L1 cache keeps all memory operations in order for
1166 // wavefronts in the same work-group.
1167 break;
1168 default:
1169 llvm_unreachable("Unsupported synchronization scope");
1170 }
1171 }
1172
1173 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1174 switch (Scope) {
1175 case SIAtomicScope::SYSTEM:
1176 case SIAtomicScope::AGENT:
1177 case SIAtomicScope::WORKGROUP:
1178 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1179 // not needed as LDS operations for all waves are executed in a total
1180 // global ordering as observed by all waves. Required if also
1181 // synchronizing with global/GDS memory as LDS operations could be
1182 // reordered with respect to later global/GDS memory operations of the
1183 // same wave.
1184 LGKMCnt |= IsCrossAddrSpaceOrdering;
1185 break;
1186 case SIAtomicScope::WAVEFRONT:
1187 case SIAtomicScope::SINGLETHREAD:
1188 // The LDS keeps all memory operations in order for
1189 // the same wavefront.
1190 break;
1191 default:
1192 llvm_unreachable("Unsupported synchronization scope");
1193 }
1194 }
1195
1196 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1197 switch (Scope) {
1198 case SIAtomicScope::SYSTEM:
1199 case SIAtomicScope::AGENT:
1200 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1201 // is not needed as GDS operations for all waves are executed in a total
1202 // global ordering as observed by all waves. Required if also
1203 // synchronizing with global/LDS memory as GDS operations could be
1204 // reordered with respect to later global/LDS memory operations of the
1205 // same wave.
1206 LGKMCnt |= IsCrossAddrSpaceOrdering;
1207 break;
1208 case SIAtomicScope::WORKGROUP:
1209 case SIAtomicScope::WAVEFRONT:
1210 case SIAtomicScope::SINGLETHREAD:
1211 // The GDS keeps all memory operations in order for
1212 // the same work-group.
1213 break;
1214 default:
1215 llvm_unreachable("Unsupported synchronization scope");
1216 }
1217 }
1218
1219 if (VMCnt || LGKMCnt) {
1220 unsigned WaitCntImmediate =
1221 AMDGPU::encodeWaitcnt(IV,
1222 VMCnt ? 0 : getVmcntBitMask(IV),
1223 getExpcntBitMask(IV),
1224 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1225 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1226 .addImm(WaitCntImmediate);
1227 Changed = true;
1228 }
1229
1230 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1231 // at workgroup-scoped release operations that specify the LDS address space.
1232 // SIInsertWaitcnts will later replace this with a vmcnt().
1233 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1234 Scope == SIAtomicScope::WORKGROUP &&
1235 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1236 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1237 Changed = true;
1238 }
1239
1240 if (Pos == Position::AFTER)
1241 --MI;
1242
1243 return Changed;
1244}
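// Worked example (derived from the cases above, for illustration): an
// agent-scope release ordering the global and LDS address spaces with cross
// address space ordering sets both VMCnt and LGKMCnt, so the S_WAITCNT_soft
// emitted here waits for vmcnt(0) and lgkmcnt(0) while leaving expcnt
// unconstrained.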
1245
1246bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1247 SIAtomicScope Scope,
1248 SIAtomicAddrSpace AddrSpace,
1249 Position Pos) const {
1250 if (!InsertCacheInv)
1251 return false;
1252
1253 bool Changed = false;
1254
1255 MachineBasicBlock &MBB = *MI->getParent();
1256 DebugLoc DL = MI->getDebugLoc();
1257
1258 if (Pos == Position::AFTER)
1259 ++MI;
1260
1261 if (canAffectGlobalAddrSpace(AddrSpace)) {
1262 switch (Scope) {
1263 case SIAtomicScope::SYSTEM:
1264 case SIAtomicScope::AGENT:
1265 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1266 Changed = true;
1267 break;
1268 case SIAtomicScope::WORKGROUP:
1269 case SIAtomicScope::WAVEFRONT:
1270 case SIAtomicScope::SINGLETHREAD:
1271 // No cache to invalidate.
1272 break;
1273 default:
1274 llvm_unreachable("Unsupported synchronization scope");
1275 }
1276 }
1277
1278 /// The scratch address space does not need the global memory cache
1279 /// to be flushed as all memory operations by the same thread are
1280 /// sequentially consistent, and no other thread can access scratch
1281 /// memory.
1282
1283 /// Other address spaces do not have a cache.
1284
1285 if (Pos == Position::AFTER)
1286 --MI;
1287
1288 return Changed;
1289}
1290
1291bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1292 SIAtomicScope Scope,
1293 SIAtomicAddrSpace AddrSpace,
1294 bool IsCrossAddrSpaceOrdering,
1295 Position Pos) const {
1296 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1297 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1298}
1299
1300bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1301 SIAtomicScope Scope,
1302 SIAtomicAddrSpace AddrSpace,
1303 Position Pos) const {
1304 if (!InsertCacheInv)
1305 return false;
1306
1307 bool Changed = false;
1308
1309 MachineBasicBlock &MBB = *MI->getParent();
1310 DebugLoc DL = MI->getDebugLoc();
1311
1312 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1313
1314 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1315 ? AMDGPU::BUFFER_WBINVL1
1316 : AMDGPU::BUFFER_WBINVL1_VOL;
1317
1318 if (Pos == Position::AFTER)
1319 ++MI;
1320
1321 if (canAffectGlobalAddrSpace(AddrSpace)) {
1322 switch (Scope) {
1323 case SIAtomicScope::SYSTEM:
1324 case SIAtomicScope::AGENT:
1325 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1326 Changed = true;
1327 break;
1328 case SIAtomicScope::WORKGROUP:
1329 case SIAtomicScope::WAVEFRONT:
1330 case SIAtomicScope::SINGLETHREAD:
1331 // No cache to invalidate.
1332 break;
1333 default:
1334 llvm_unreachable("Unsupported synchronization scope");
1335 }
1336 }
1337
1338 /// The scratch address space does not need the global memory cache
1339 /// to be flushed as all memory operations by the same thread are
1340 /// sequentially consistent, and no other thread can access scratch
1341 /// memory.
1342
1343 /// Other address spaces do not have a cache.
1344
1345 if (Pos == Position::AFTER)
1346 --MI;
1347
1348 return Changed;
1349}
1350
1351bool SIGfx90ACacheControl::enableLoadCacheBypass(
1352 const MachineBasicBlock::iterator &MI,
1353 SIAtomicScope Scope,
1354 SIAtomicAddrSpace AddrSpace) const {
1355 assert(MI->mayLoad() && !MI->mayStore());
1356 bool Changed = false;
1357
1358 if (canAffectGlobalAddrSpace(AddrSpace)) {
1359 switch (Scope) {
1360 case SIAtomicScope::SYSTEM:
1361 case SIAtomicScope::AGENT:
1362 // Set the L1 cache policy to MISS_LRU.
1363 // Note: there is no L2 cache bypass policy at the ISA level.
1364 Changed |= enableGLCBit(MI);
1365 break;
1366 case SIAtomicScope::WORKGROUP:
1367 // In threadgroup split mode the waves of a work-group can be executing on
1368 // different CUs. Therefore need to bypass the L1 which is per CU.
1369 // Otherwise in non-threadgroup split mode all waves of a work-group are
1370 // on the same CU, and so the L1 does not need to be bypassed.
1371 if (ST.isTgSplitEnabled())
1372 Changed |= enableGLCBit(MI);
1373 break;
1374 case SIAtomicScope::WAVEFRONT:
1375 case SIAtomicScope::SINGLETHREAD:
1376 // No cache to bypass.
1377 break;
1378 default:
1379 llvm_unreachable("Unsupported synchronization scope");
1380 }
1381 }
1382
1383 /// The scratch address space does not need the global memory caches
1384 /// to be bypassed as all memory operations by the same thread are
1385 /// sequentially consistent, and no other thread can access scratch
1386 /// memory.
1387
1388 /// Other address spaces do not have a cache.
1389
1390 return Changed;
1391}
1392
1393bool SIGfx90ACacheControl::enableRMWCacheBypass(
1394 const MachineBasicBlock::iterator &MI,
1395 SIAtomicScope Scope,
1396 SIAtomicAddrSpace AddrSpace) const {
1397 assert(MI->mayLoad() && MI->mayStore());
1398 bool Changed = false;
1399
1400 if (canAffectGlobalAddrSpace(AddrSpace)) {
1401 switch (Scope) {
1402 case SIAtomicScope::SYSTEM:
1403 case SIAtomicScope::AGENT:
1404 /// Do not set glc for RMW atomic operations as they implicitly bypass
1405 /// the L1 cache, and the glc bit is instead used to indicate if they are
1406 /// return or no-return.
1407 break;
1408 case SIAtomicScope::WORKGROUP:
1409 case SIAtomicScope::WAVEFRONT:
1410 case SIAtomicScope::SINGLETHREAD:
1411 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1412 break;
1413 default:
1414 llvm_unreachable("Unsupported synchronization scope");
1415 }
1416 }
1417
1418 return Changed;
1419}
1420
1421bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1422 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1423 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1424 // Only handle load and store, not atomic read-modify-write instructions. The
1425 // latter use glc to indicate if the atomic returns a result and so must not
1426 // be used for cache control.
1427 assert(MI->mayLoad() ^ MI->mayStore());
1428
1429 // Only update load and store, not LLVM IR atomic read-modify-write
1430 // instructions. The latter are always marked as volatile, so they cannot be
1431 // handled here without pessimizing all atomics. Also, they do not support
1432 // the nontemporal attribute.
1433 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1434
1435 bool Changed = false;
1436
1437 if (IsVolatile) {
1438 // Set L1 cache policy to be MISS_EVICT for load instructions
1439 // and MISS_LRU for store instructions.
1440 // Note: there is no L2 cache bypass policy at the ISA level.
1441 if (Op == SIMemOp::LOAD)
1442 Changed |= enableGLCBit(MI);
1443
1444 // Ensure operation has completed at system scope to cause all volatile
1445 // operations to be visible outside the program in a global order. Do not
1446 // request cross address space as only the global address space can be
1447 // observable outside the program, so no need to cause a waitcnt for LDS
1448 // address space operations.
1449 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1450 Position::AFTER, AtomicOrdering::Unordered);
1451
1452 return Changed;
1453 }
1454
1455 if (IsNonTemporal) {
1456 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1457 // for both loads and stores, and the L2 cache policy to STREAM.
1458 Changed |= enableGLCBit(MI);
1459 Changed |= enableSLCBit(MI);
1460 return Changed;
1461 }
1462
1463 return Changed;
1464}
1465
1466bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1467 SIAtomicScope Scope,
1468 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1469 bool IsCrossAddrSpaceOrdering,
1470 Position Pos,
1471 AtomicOrdering Order) const {
1472 if (ST.isTgSplitEnabled()) {
1473 // In threadgroup split mode the waves of a work-group can be executing on
1474 // different CUs. Therefore need to wait for global or GDS memory operations
1475 // to complete to ensure they are visible to waves in the other CUs.
1476 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1477 // the same CU, so no need to wait for global memory as all waves in the
1478 // work-group access the same L1, nor wait for GDS as accesses are ordered
1479 // on a CU.
1480 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1481 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1482 (Scope == SIAtomicScope::WORKGROUP)) {
1483 // Same as GFX7 using agent scope.
1484 Scope = SIAtomicScope::AGENT;
1485 }
1486 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1487 // LDS memory operations.
1488 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1489 }
1490 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1491 IsCrossAddrSpaceOrdering, Pos, Order);
1492}
1493
1494bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1495 SIAtomicScope Scope,
1496 SIAtomicAddrSpace AddrSpace,
1497 Position Pos) const {
1498 if (!InsertCacheInv)
1499 return false;
1500
1501 bool Changed = false;
1502
1503 MachineBasicBlock &MBB = *MI->getParent();
1504 DebugLoc DL = MI->getDebugLoc();
1505
1506 if (Pos == Position::AFTER)
1507 ++MI;
1508
1509 if (canAffectGlobalAddrSpace(AddrSpace)) {
1510 switch (Scope) {
1511 case SIAtomicScope::SYSTEM:
1512 // Ensures that following loads will not see stale remote VMEM data or
1513 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1514 // CC will never be stale due to the local memory probes.
1515 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1516 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1517 // hardware does not reorder memory operations by the same wave with
1518 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1519 // remove any cache lines of earlier writes by the same wave and ensures
1520 // later reads by the same wave will refetch the cache lines.
1521 Changed = true;
1522 break;
1523 case SIAtomicScope::AGENT:
1524 // Same as GFX7.
1525 break;
1526 case SIAtomicScope::WORKGROUP:
1527 // In threadgroup split mode the waves of a work-group can be executing on
1528 // different CUs. Therefore need to invalidate the L1 which is per CU.
1529 // Otherwise in non-threadgroup split mode all waves of a work-group are
1530 // on the same CU, and so the L1 does not need to be invalidated.
1531 if (ST.isTgSplitEnabled()) {
1532 // Same as GFX7 using agent scope.
1533 Scope = SIAtomicScope::AGENT;
1534 }
1535 break;
1536 case SIAtomicScope::WAVEFRONT:
1537 case SIAtomicScope::SINGLETHREAD:
1538 // Same as GFX7.
1539 break;
1540 default:
1541 llvm_unreachable("Unsupported synchronization scope");
1542 }
1543 }
1544
1545 /// The scratch address space does not need the global memory cache
1546 /// to be flushed as all memory operations by the same thread are
1547 /// sequentially consistent, and no other thread can access scratch
1548 /// memory.
1549
1550 /// Other address spaces do not have a cache.
1551
1552 if (Pos == Position::AFTER)
1553 --MI;
1554
1555 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1556
1557 return Changed;
1558}
1559
1560bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1561 SIAtomicScope Scope,
1562 SIAtomicAddrSpace AddrSpace,
1563 bool IsCrossAddrSpaceOrdering,
1564 Position Pos) const {
1565 bool Changed = false;
1566
1567 MachineBasicBlock &MBB = *MI->getParent();
1568 const DebugLoc &DL = MI->getDebugLoc();
1569
1570 if (Pos == Position::AFTER)
1571 ++MI;
1572
1573 if (canAffectGlobalAddrSpace(AddrSpace)) {
1574 switch (Scope) {
1575 case SIAtomicScope::SYSTEM:
1576 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1577 // hardware does not reorder memory operations by the same wave with
1578 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1579 // to initiate writeback of any dirty cache lines of earlier writes by the
1580 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1581 // writeback has completed.
1582 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1583 // Set SC bits to indicate system scope.
1584 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1585 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1586 // vmcnt(0)" needed by the "BUFFER_WBL2".
1587 Changed = true;
1588 break;
1589 case SIAtomicScope::AGENT:
1590 case SIAtomicScope::WORKGROUP:
1591 case SIAtomicScope::WAVEFRONT:
1592 case SIAtomicScope::SINGLETHREAD:
1593 // Same as GFX7.
1594 break;
1595 default:
1596 llvm_unreachable("Unsupported synchronization scope");
1597 }
1598 }
1599
1600 if (Pos == Position::AFTER)
1601 --MI;
1602
1603 Changed |=
1604 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1605 IsCrossAddrSpaceOrdering, Pos);
1606
1607 return Changed;
1608}
1609
1610bool SIGfx940CacheControl::enableLoadCacheBypass(
1611 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1612 SIAtomicAddrSpace AddrSpace) const {
1613 assert(MI->mayLoad() && !MI->mayStore());
1614 bool Changed = false;
1615
1616 if (canAffectGlobalAddrSpace(AddrSpace)) {
1617 switch (Scope) {
1618 case SIAtomicScope::SYSTEM:
1619 // Set SC bits to indicate system scope.
1620 Changed |= enableSC0Bit(MI);
1621 Changed |= enableSC1Bit(MI);
1622 break;
1623 case SIAtomicScope::AGENT:
1624 // Set SC bits to indicate agent scope.
1625 Changed |= enableSC1Bit(MI);
1626 break;
1627 case SIAtomicScope::WORKGROUP:
1628 // In threadgroup split mode the waves of a work-group can be executing on
1629 // different CUs. Therefore need to bypass the L1 which is per CU.
1630 // Otherwise in non-threadgroup split mode all waves of a work-group are
1631 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1632 // bits to indicate work-group scope will do this automatically.
1633 Changed |= enableSC0Bit(MI);
1634 break;
1635 case SIAtomicScope::WAVEFRONT:
1636 case SIAtomicScope::SINGLETHREAD:
1637 // Leave SC bits unset to indicate wavefront scope.
1638 break;
1639 default:
1640 llvm_unreachable("Unsupported synchronization scope");
1641 }
1642 }
1643
1644 /// The scratch address space does not need the global memory caches
1645 /// to be bypassed as all memory operations by the same thread are
1646 /// sequentially consistent, and no other thread can access scratch
1647 /// memory.
1648
1649 /// Other address spaces do not have a cache.
1650
1651 return Changed;
1652}
1653
1654 bool SIGfx940CacheControl::enableStoreCacheBypass(
1655 const MachineBasicBlock::iterator &MI,
1656 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1657 assert(!MI->mayLoad() && MI->mayStore());
1658 bool Changed = false;
1659
1660 if (canAffectGlobalAddrSpace(AddrSpace)) {
1661 switch (Scope) {
1662 case SIAtomicScope::SYSTEM:
1663 // Set SC bits to indicate system scope.
1664 Changed |= enableSC0Bit(MI);
1665 Changed |= enableSC1Bit(MI);
1666 break;
1667 case SIAtomicScope::AGENT:
1668 // Set SC bits to indicate agent scope.
1669 Changed |= enableSC1Bit(MI);
1670 break;
1671 case SIAtomicScope::WORKGROUP:
1672 // Set SC bits to indicate workgroup scope.
1673 Changed |= enableSC0Bit(MI);
1674 break;
1675 case SIAtomicScope::WAVEFRONT:
1676 case SIAtomicScope::SINGLETHREAD:
1677 // Leave SC bits unset to indicate wavefront scope.
1678 break;
1679 default:
1680 llvm_unreachable("Unsupported synchronization scope");
1681 }
1682 }
1683
1684 /// The scratch address space does not need the global memory caches
1685 /// to be bypassed as all memory operations by the same thread are
1686 /// sequentially consistent, and no other thread can access scratch
1687 /// memory.
1688
1689 /// Other address spaces do not have a cache.
1690
1691 return Changed;
1692}
1693
1694bool SIGfx940CacheControl::enableRMWCacheBypass(
1695 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1696 SIAtomicAddrSpace AddrSpace) const {
1697 assert(MI->mayLoad() && MI->mayStore());
1698 bool Changed = false;
1699
1700 if (canAffectGlobalAddrSpace(AddrSpace)) {
1701 switch (Scope) {
1702 case SIAtomicScope::SYSTEM:
1703 // Set SC1 bit to indicate system scope.
1704 Changed |= enableSC1Bit(MI);
1705 break;
1706 case SIAtomicScope::AGENT:
1707 case SIAtomicScope::WORKGROUP:
1708 case SIAtomicScope::WAVEFRONT:
1709 case SIAtomicScope::SINGLETHREAD:
1710 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1711 // to indicate system or agent scope. The SC0 bit is used to indicate if
1712 // they are return or no-return. Leave SC1 bit unset to indicate agent
1713 // scope.
1714 break;
1715 default:
1716 llvm_unreachable("Unsupported synchronization scope");
1717 }
1718 }
1719
1720 return Changed;
1721}
1722
1723bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1724 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1725 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1726 // Only handle load and store, not atomic read-modify-write instructions. The
1727 // latter use glc to indicate if the atomic returns a result and so must not
1728 // be used for cache control.
1729 assert(MI->mayLoad() ^ MI->mayStore());
1730
1731 // Only update load and store, not LLVM IR atomic read-modify-write
1732 // instructions. The latter are always marked as volatile, so they cannot be
1733 // handled here without pessimizing all atomics. They also do not support
1734 // the nontemporal attribute.
1735 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1736
1737 bool Changed = false;
1738
1739 if (IsVolatile) {
1740 // Set SC bits to indicate system scope.
1741 Changed |= enableSC0Bit(MI);
1742 Changed |= enableSC1Bit(MI);
1743
1744 // Ensure operation has completed at system scope to cause all volatile
1745 // operations to be visible outside the program in a global order. Do not
1746 // request cross address space as only the global address space can be
1747 // observable outside the program, so no need to cause a waitcnt for LDS
1748 // address space operations.
1749 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1750 Position::AFTER, AtomicOrdering::Unordered);
1751
1752 return Changed;
1753 }
1754
1755 if (IsNonTemporal) {
1756 Changed |= enableNTBit(MI);
1757 return Changed;
1758 }
1759
1760 return Changed;
1761}
1762
1763bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1764 SIAtomicScope Scope,
1765 SIAtomicAddrSpace AddrSpace,
1766 Position Pos) const {
1767 if (!InsertCacheInv)
1768 return false;
1769
1770 bool Changed = false;
1771
1772 MachineBasicBlock &MBB = *MI->getParent();
1773 DebugLoc DL = MI->getDebugLoc();
1774
1775 if (Pos == Position::AFTER)
1776 ++MI;
1777
1778 if (canAffectGlobalAddrSpace(AddrSpace)) {
1779 switch (Scope) {
1780 case SIAtomicScope::SYSTEM:
1781 // Ensures that following loads will not see stale remote VMEM data or
1782 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1783 // CC will never be stale due to the local memory probes.
1784 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1785 // Set SC bits to indicate system scope.
1786 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1787 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1788 // hardware does not reorder memory operations by the same wave with
1789 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1790 // remove any cache lines of earlier writes by the same wave and ensures
1791 // later reads by the same wave will refetch the cache lines.
1792 Changed = true;
1793 break;
1794 case SIAtomicScope::AGENT:
1795 // Ensures that following loads will not see stale remote data or local
1796 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1797 // due to the memory probes.
1798 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1799 // Set SC bits to indicate agent scope.
1800 .addImm(AMDGPU::CPol::SC1);
1801 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1802 // does not reorder memory operations with respect to the preceding buffer
1803 // invalidate. The invalidate is guaranteed to remove any cache lines of
1804 // earlier writes and ensures later reads will refetch the cache lines.
1805 Changed = true;
1806 break;
1807 case SIAtomicScope::WORKGROUP:
1808 // In threadgroup split mode the waves of a work-group can be executing on
1809 // different CUs. Therefore need to invalidate the L1 which is per CU.
1810 // Otherwise in non-threadgroup split mode all waves of a work-group are
1811 // on the same CU, and so the L1 does not need to be invalidated.
1812 if (ST.isTgSplitEnabled()) {
1813 // Ensures L1 is invalidated if in threadgroup split mode. In
1814 // non-threadgroup split mode it is a NOP, but there is no point generating
1815 // it in that case when we know we are not in that mode.
1816 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1817 // Set SC bits to indicate work-group scope.
1818 .addImm(AMDGPU::CPol::SC0);
1819 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1820 // does not reorder memory operations with respect to the preceding buffer
1821 // invalidate. The invalidate is guaranteed to remove any cache lines of
1822 // earlier writes and ensures later reads will refetch the cache lines.
1823 Changed = true;
1824 }
1825 break;
1826 case SIAtomicScope::WAVEFRONT:
1827 case SIAtomicScope::SINGLETHREAD:
1828 // Could generate "BUFFER_INV" but it would do nothing as there are no
1829 // caches to invalidate.
1830 break;
1831 default:
1832 llvm_unreachable("Unsupported synchronization scope");
1833 }
1834 }
1835
1836 /// The scratch address space does not need the global memory cache
1837 /// to be flushed as all memory operations by the same thread are
1838 /// sequentially consistent, and no other thread can access scratch
1839 /// memory.
1840
1841 /// Other address spaces do not have a cache.
1842
1843 if (Pos == Position::AFTER)
1844 --MI;
1845
1846 return Changed;
1847}
1848
1849bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1850 SIAtomicScope Scope,
1851 SIAtomicAddrSpace AddrSpace,
1852 bool IsCrossAddrSpaceOrdering,
1853 Position Pos) const {
1854 bool Changed = false;
1855
1856 MachineBasicBlock &MBB = *MI->getParent();
1857 DebugLoc DL = MI->getDebugLoc();
1858
1859 if (Pos == Position::AFTER)
1860 ++MI;
1861
1862 if (canAffectGlobalAddrSpace(AddrSpace)) {
1863 switch (Scope) {
1864 case SIAtomicScope::SYSTEM:
1865 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1866 // hardware does not reorder memory operations by the same wave with
1867 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1868 // to initiate writeback of any dirty cache lines of earlier writes by the
1869 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1870 // writeback has completed.
1871 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1872 // Set SC bits to indicate system scope.
1873 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1874 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1875 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1876 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1877 Changed = true;
1878 break;
1879 case SIAtomicScope::AGENT:
1880 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1881 // Set SC bits to indicate agent scope.
1882 .addImm(AMDGPU::CPol::SC1);
1883 
1884 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1885 // SIAtomicScope::AGENT, the following insertWait will generate the
1886 // required "S_WAITCNT vmcnt(0)".
1887 Changed = true;
1888 break;
1889 case SIAtomicScope::WORKGROUP:
1890 case SIAtomicScope::WAVEFRONT:
1891 case SIAtomicScope::SINGLETHREAD:
1892 // Do not generate "BUFFER_WBL2" as there are no caches it would
1893 // writeback, and would require an otherwise unnecessary
1894 // "S_WAITCNT vmcnt(0)".
1895 break;
1896 default:
1897 llvm_unreachable("Unsupported synchronization scope");
1898 }
1899 }
1900
1901 if (Pos == Position::AFTER)
1902 --MI;
1903
1904 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1905 // S_WAITCNT needed.
1906 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1907 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1908
1909 return Changed;
1910}
1911
1912 bool SIGfx10CacheControl::enableLoadCacheBypass(
1913 const MachineBasicBlock::iterator &MI,
1914 SIAtomicScope Scope,
1915 SIAtomicAddrSpace AddrSpace) const {
1916 assert(MI->mayLoad() && !MI->mayStore());
1917 bool Changed = false;
1918
1919 if (canAffectGlobalAddrSpace(AddrSpace)) {
1920 switch (Scope) {
1921 case SIAtomicScope::SYSTEM:
1922 case SIAtomicScope::AGENT:
1923 // Set the L0 and L1 cache policies to MISS_EVICT.
1924 // Note: there is no L2 cache coherent bypass control at the ISA level.
1925 Changed |= enableGLCBit(MI);
1926 Changed |= enableDLCBit(MI);
1927 break;
1928 case SIAtomicScope::WORKGROUP:
1929 // In WGP mode the waves of a work-group can be executing on either CU of
1930 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1931 // CU mode all waves of a work-group are on the same CU, and so the L0
1932 // does not need to be bypassed.
1933 if (!ST.isCuModeEnabled())
1934 Changed |= enableGLCBit(MI);
1935 break;
1936 case SIAtomicScope::WAVEFRONT:
1937 case SIAtomicScope::SINGLETHREAD:
1938 // No cache to bypass.
1939 break;
1940 default:
1941 llvm_unreachable("Unsupported synchronization scope");
1942 }
1943 }
1944
1945 /// The scratch address space does not need the global memory caches
1946 /// to be bypassed as all memory operations by the same thread are
1947 /// sequentially consistent, and no other thread can access scratch
1948 /// memory.
1949
1950 /// Other address spaces do not have a cache.
1951
1952 return Changed;
1953}
1954
1955bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1956 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1957 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1958
1959 // Only handle load and store, not atomic read-modify-write instructions. The
1960 // latter use glc to indicate if the atomic returns a result and so must not
1961 // be used for cache control.
1962 assert(MI->mayLoad() ^ MI->mayStore());
1963
1964 // Only update load and store, not LLVM IR atomic read-modify-write
1965 // instructions. The latter are always marked as volatile, so they cannot be
1966 // handled here without pessimizing all atomics. They also do not support
1967 // the nontemporal attribute.
1968 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1969
1970 bool Changed = false;
1971
1972 if (IsVolatile) {
1973 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1974 // and MISS_LRU for store instructions.
1975 // Note: there is no L2 cache coherent bypass control at the ISA level.
1976 if (Op == SIMemOp::LOAD) {
1977 Changed |= enableGLCBit(MI);
1978 Changed |= enableDLCBit(MI);
1979 }
1980
1981 // Ensure operation has completed at system scope to cause all volatile
1982 // operations to be visible outside the program in a global order. Do not
1983 // request cross address space as only the global address space can be
1984 // observable outside the program, so no need to cause a waitcnt for LDS
1985 // address space operations.
1986 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1987 Position::AFTER, AtomicOrdering::Unordered);
1988 return Changed;
1989 }
1990
1991 if (IsNonTemporal) {
1992 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1993 // and L2 cache policy to STREAM.
1994 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1995 // to MISS_EVICT and the L2 cache policy to STREAM.
1996 if (Op == SIMemOp::STORE)
1997 Changed |= enableGLCBit(MI);
1998 Changed |= enableSLCBit(MI);
1999
2000 return Changed;
2001 }
2002
2003 return Changed;
2004}
2005
2006bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2007 SIAtomicScope Scope,
2008 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2009 bool IsCrossAddrSpaceOrdering,
2010 Position Pos, AtomicOrdering Order) const {
2011 bool Changed = false;
2012
2013 MachineBasicBlock &MBB = *MI->getParent();
2014 DebugLoc DL = MI->getDebugLoc();
2015
2016 if (Pos == Position::AFTER)
2017 ++MI;
2018
2019 bool VMCnt = false;
2020 bool VSCnt = false;
2021 bool LGKMCnt = false;
2022
2023 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2024 SIAtomicAddrSpace::NONE) {
2025 switch (Scope) {
2026 case SIAtomicScope::SYSTEM:
2027 case SIAtomicScope::AGENT:
2028 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2029 VMCnt |= true;
2030 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2031 VSCnt |= true;
2032 break;
2033 case SIAtomicScope::WORKGROUP:
2034 // In WGP mode the waves of a work-group can be executing on either CU of
2035 // the WGP. Therefore need to wait for operations to complete to ensure
2036 // they are visible to waves in the other CU as the L0 is per CU.
2037 // Otherwise, in CU mode all waves of a work-group are on the same CU,
2038 // which shares the same L0.
2039 if (!ST.isCuModeEnabled()) {
2040 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2041 VMCnt |= true;
2042 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2043 VSCnt |= true;
2044 }
2045 break;
2046 case SIAtomicScope::WAVEFRONT:
2047 case SIAtomicScope::SINGLETHREAD:
2048 // The L0 cache keeps all memory operations in order for
2049 // work-items in the same wavefront.
2050 break;
2051 default:
2052 llvm_unreachable("Unsupported synchronization scope");
2053 }
2054 }
2055
2056 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2057 switch (Scope) {
2058 case SIAtomicScope::SYSTEM:
2059 case SIAtomicScope::AGENT:
2060 case SIAtomicScope::WORKGROUP:
2061 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2062 // not needed as LDS operations for all waves are executed in a total
2063 // global ordering as observed by all waves. Required if also
2064 // synchronizing with global/GDS memory as LDS operations could be
2065 // reordered with respect to later global/GDS memory operations of the
2066 // same wave.
2067 LGKMCnt |= IsCrossAddrSpaceOrdering;
2068 break;
2069 case SIAtomicScope::WAVEFRONT:
2070 case SIAtomicScope::SINGLETHREAD:
2071 // The LDS keeps all memory operations in order for
2072 // the same wavefront.
2073 break;
2074 default:
2075 llvm_unreachable("Unsupported synchronization scope");
2076 }
2077 }
2078
2079 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2080 switch (Scope) {
2081 case SIAtomicScope::SYSTEM:
2082 case SIAtomicScope::AGENT:
2083 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
2084 // is not needed as GDS operations for all waves are executed in a total
2085 // global ordering as observed by all waves. Required if also
2086 // synchronizing with global/LDS memory as GDS operations could be
2087 // reordered with respect to later global/LDS memory operations of the
2088 // same wave.
2089 LGKMCnt |= IsCrossAddrSpaceOrdering;
2090 break;
2091 case SIAtomicScope::WORKGROUP:
2092 case SIAtomicScope::WAVEFRONT:
2093 case SIAtomicScope::SINGLETHREAD:
2094 // The GDS keeps all memory operations in order for
2095 // the same work-group.
2096 break;
2097 default:
2098 llvm_unreachable("Unsupported synchronization scope");
2099 }
2100 }
2101
2102 if (VMCnt || LGKMCnt) {
2103 unsigned WaitCntImmediate =
2104 AMDGPU::encodeWaitcnt(IV,
2105 VMCnt ? 0 : getVmcntBitMask(IV),
2106 getExpcntBitMask(IV),
2107 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2108 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2109 .addImm(WaitCntImmediate);
2110 Changed = true;
2111 }
2112
2113 // On architectures that support direct loads to LDS, emit an unknown waitcnt
2114 // at workgroup-scoped release operations that specify the LDS address space.
2115 // SIInsertWaitcnts will later replace this with a vmcnt().
2116 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
2117 Scope == SIAtomicScope::WORKGROUP &&
2118 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2119 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
2120 Changed = true;
2121 }
2122
2123 if (VSCnt) {
2124 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2125 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2126 .addImm(0);
2127 Changed = true;
2128 }
2129
2130 if (Pos == Position::AFTER)
2131 --MI;
2132
2133 return Changed;
2134}
2135
2136bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2137 SIAtomicScope Scope,
2138 SIAtomicAddrSpace AddrSpace,
2139 Position Pos) const {
2140 if (!InsertCacheInv)
2141 return false;
2142
2143 bool Changed = false;
2144
2145 MachineBasicBlock &MBB = *MI->getParent();
2146 DebugLoc DL = MI->getDebugLoc();
2147
2148 if (Pos == Position::AFTER)
2149 ++MI;
2150
2151 if (canAffectGlobalAddrSpace(AddrSpace)) {
2152 switch (Scope) {
2153 case SIAtomicScope::SYSTEM:
2154 case SIAtomicScope::AGENT:
2155 // The order of invalidates matters here. We must invalidate "outer in"
2156 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2157 // invalidated.
2158 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2159 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2160 Changed = true;
2161 break;
2162 case SIAtomicScope::WORKGROUP:
2163 // In WGP mode the waves of a work-group can be executing on either CU of
2164 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2165 // in CU mode all waves of a work-group are on the same CU, and so the
2166 // L0 does not need to be invalidated.
2167 if (!ST.isCuModeEnabled()) {
2168 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2169 Changed = true;
2170 }
2171 break;
2172 case SIAtomicScope::WAVEFRONT:
2173 case SIAtomicScope::SINGLETHREAD:
2174 // No cache to invalidate.
2175 break;
2176 default:
2177 llvm_unreachable("Unsupported synchronization scope");
2178 }
2179 }
2180
2181 /// The scratch address space does not need the global memory cache
2182 /// to be flushed as all memory operations by the same thread are
2183 /// sequentially consistent, and no other thread can access scratch
2184 /// memory.
2185
2186 /// Other address spaces do not have a cache.
2187
2188 if (Pos == Position::AFTER)
2189 --MI;
2190
2191 return Changed;
2192}
2193
2194 bool SIGfx10CacheControl::insertBarrierStart(
2195 MachineBasicBlock::iterator &MI) const {
2196 // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
2197 // mode. This is because a CU mode release fence does not emit any wait, which
2198 // is fine when only dealing with vmem, but isn't sufficient in the presence
2199 // of barriers which do not go through vmem.
2200 // GFX12.5 does not require this additional wait.
2201 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
2202 return false;
2203
2204 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2205 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
2206 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
2207 return true;
2208}
2209
2210bool SIGfx11CacheControl::enableLoadCacheBypass(
2211 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2212 SIAtomicAddrSpace AddrSpace) const {
2213 assert(MI->mayLoad() && !MI->mayStore());
2214 bool Changed = false;
2215
2216 if (canAffectGlobalAddrSpace(AddrSpace)) {
2217 switch (Scope) {
2218 case SIAtomicScope::SYSTEM:
2219 case SIAtomicScope::AGENT:
2220 // Set the L0 and L1 cache policies to MISS_EVICT.
2221 // Note: there is no L2 cache coherent bypass control at the ISA level.
2222 Changed |= enableGLCBit(MI);
2223 break;
2224 case SIAtomicScope::WORKGROUP:
2225 // In WGP mode the waves of a work-group can be executing on either CU of
2226 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2227 // CU mode all waves of a work-group are on the same CU, and so the L0
2228 // does not need to be bypassed.
2229 if (!ST.isCuModeEnabled())
2230 Changed |= enableGLCBit(MI);
2231 break;
2232 case SIAtomicScope::WAVEFRONT:
2233 case SIAtomicScope::SINGLETHREAD:
2234 // No cache to bypass.
2235 break;
2236 default:
2237 llvm_unreachable("Unsupported synchronization scope");
2238 }
2239 }
2240
2241 /// The scratch address space does not need the global memory caches
2242 /// to be bypassed as all memory operations by the same thread are
2243 /// sequentially consistent, and no other thread can access scratch
2244 /// memory.
2245
2246 /// Other address spaces do not have a cache.
2247
2248 return Changed;
2249}
2250
2251bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2252 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2253 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2254
2255 // Only handle load and store, not atomic read-modify-write instructions. The
2256 // latter use glc to indicate if the atomic returns a result and so must not
2257 // be used for cache control.
2258 assert(MI->mayLoad() ^ MI->mayStore());
2259
2260 // Only update load and store, not LLVM IR atomic read-modify-write
2261 // instructions. The latter are always marked as volatile, so they cannot be
2262 // handled here without pessimizing all atomics. They also do not support
2263 // the nontemporal attribute.
2264 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2265
2266 bool Changed = false;
2267
2268 if (IsVolatile) {
2269 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2270 // and MISS_LRU for store instructions.
2271 // Note: there is no L2 cache coherent bypass control at the ISA level.
2272 if (Op == SIMemOp::LOAD)
2273 Changed |= enableGLCBit(MI);
2274
2275 // Set MALL NOALLOC for load and store instructions.
2276 Changed |= enableDLCBit(MI);
2277
2278 // Ensure operation has completed at system scope to cause all volatile
2279 // operations to be visible outside the program in a global order. Do not
2280 // request cross address space as only the global address space can be
2281 // observable outside the program, so no need to cause a waitcnt for LDS
2282 // address space operations.
2283 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2284 Position::AFTER, AtomicOrdering::Unordered);
2285 return Changed;
2286 }
2287
2288 if (IsNonTemporal) {
2289 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2290 // and L2 cache policy to STREAM.
2291 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2292 // to MISS_EVICT and the L2 cache policy to STREAM.
2293 if (Op == SIMemOp::STORE)
2294 Changed |= enableGLCBit(MI);
2295 Changed |= enableSLCBit(MI);
2296
2297 // Set MALL NOALLOC for load and store instructions.
2298 Changed |= enableDLCBit(MI);
2299 return Changed;
2300 }
2301
2302 return Changed;
2303}
2304
2305bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2306 AMDGPU::CPol::CPol Value) const {
2307 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2308 if (!CPol)
2309 return false;
2310
2311 uint64_t NewTH = Value & AMDGPU::CPol::TH;
2312 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2313 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2314 return true;
2315 }
2316
2317 return false;
2318}
2319
2320bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2321 AMDGPU::CPol::CPol Value) const {
2322 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2323 if (!CPol)
2324 return false;
2325
2326 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2327 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2328 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2329 return true;
2330 }
2331
2332 return false;
2333}
2334
2335bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2336 const MachineBasicBlock::iterator MI) const {
2337 // TODO: implement flag for frontend to give us a hint not to insert waits.
2338
2339 MachineBasicBlock &MBB = *MI->getParent();
2340 const DebugLoc &DL = MI->getDebugLoc();
2341
2342 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2343 if (ST.hasImageInsts()) {
2344 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2345 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2346 }
2347 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2348 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2349
2350 return true;
2351}
2352
2353bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2354 SIAtomicScope Scope,
2355 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2356 bool IsCrossAddrSpaceOrdering,
2357 Position Pos, AtomicOrdering Order) const {
2358 bool Changed = false;
2359
2360 MachineBasicBlock &MBB = *MI->getParent();
2361 DebugLoc DL = MI->getDebugLoc();
2362
2363 bool LOADCnt = false;
2364 bool DSCnt = false;
2365 bool STORECnt = false;
2366
2367 if (Pos == Position::AFTER)
2368 ++MI;
2369
2370 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2371 SIAtomicAddrSpace::NONE) {
2372 switch (Scope) {
2373 case SIAtomicScope::SYSTEM:
2374 case SIAtomicScope::AGENT:
2375 case SIAtomicScope::CLUSTER:
2376 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2377 LOADCnt |= true;
2378 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2379 STORECnt |= true;
2380 break;
2381 case SIAtomicScope::WORKGROUP:
2382 // GFX12.0:
2383 // In WGP mode the waves of a work-group can be executing on either CU
2384 // of the WGP. Therefore need to wait for operations to complete to
2385 // ensure they are visible to waves in the other CU as the L0 is per CU.
2386 // Otherwise, in CU mode all waves of a work-group are on the same CU,
2387 // which shares the same L0.
2388 //
2389 // GFX12.5:
2390 // CU$ has two ports. To ensure operations are visible at the workgroup
2391 // level, we need to ensure all operations in this port have completed
2392 // so the other SIMDs in the WG can see them. There is no ordering
2393 // guarantee between the ports.
2394 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
2395 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2396 LOADCnt |= true;
2397 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2398 STORECnt |= true;
2399 }
2400 break;
2401 case SIAtomicScope::WAVEFRONT:
2402 case SIAtomicScope::SINGLETHREAD:
2403 // The L0 cache keeps all memory operations in order for
2404 // work-items in the same wavefront.
2405 break;
2406 default:
2407 llvm_unreachable("Unsupported synchronization scope");
2408 }
2409 }
2410
2411 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2412 switch (Scope) {
2413 case SIAtomicScope::SYSTEM:
2414 case SIAtomicScope::AGENT:
2415 case SIAtomicScope::CLUSTER:
2416 case SIAtomicScope::WORKGROUP:
2417 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2418 // not needed as LDS operations for all waves are executed in a total
2419 // global ordering as observed by all waves. Required if also
2420 // synchronizing with global/GDS memory as LDS operations could be
2421 // reordered with respect to later global/GDS memory operations of the
2422 // same wave.
2423 DSCnt |= IsCrossAddrSpaceOrdering;
2424 break;
2425 case SIAtomicScope::WAVEFRONT:
2426 case SIAtomicScope::SINGLETHREAD:
2427 // The LDS keeps all memory operations in order for
2428 // the same wavefront.
2429 break;
2430 default:
2431 llvm_unreachable("Unsupported synchronization scope");
2432 }
2433 }
2434
2435 if (LOADCnt) {
2436 // Acquire sequences only need to wait on the previous atomic operation.
2437 // e.g. a typical sequence looks like
2438 // atomic load
2439 // (wait)
2440 // global_inv
2441 //
2442 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
2443 // to be tracked using loadcnt.
2444 //
2445 // This also applies to fences. Fences cannot pair with an instruction
2446 // tracked with bvh/samplecnt as we don't have any atomics that do that.
2447 if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
2448 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2449 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2450 }
2451 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2452 Changed = true;
2453 }
2454
2455 if (STORECnt) {
2456 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2457 Changed = true;
2458 }
2459
2460 if (DSCnt) {
2461 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2462 Changed = true;
2463 }
2464
2465 if (Pos == Position::AFTER)
2466 --MI;
2467
2468 return Changed;
2469}
2470
2471bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2472 SIAtomicScope Scope,
2473 SIAtomicAddrSpace AddrSpace,
2474 Position Pos) const {
2475 if (!InsertCacheInv)
2476 return false;
2477
2478 MachineBasicBlock &MBB = *MI->getParent();
2479 DebugLoc DL = MI->getDebugLoc();
2480
2481 /// The scratch address space does not need the global memory cache
2482 /// to be flushed as all memory operations by the same thread are
2483 /// sequentially consistent, and no other thread can access scratch
2484 /// memory.
2485
2486 /// Other address spaces do not have a cache.
2487 if (!canAffectGlobalAddrSpace(AddrSpace))
2488 return false;
2489
2490 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2491 switch (Scope) {
2492 case SIAtomicScope::SYSTEM:
2493 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2494 break;
2495 case SIAtomicScope::AGENT:
2496 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2497 break;
2498 case SIAtomicScope::CLUSTER:
2499 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2500 break;
2501 case SIAtomicScope::WORKGROUP:
2502 // GFX12.0:
2503 // In WGP mode the waves of a work-group can be executing on either CU of
2504 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2505 // Otherwise in CU mode all waves of a work-group are on the same CU, and
2506 // so the L0 does not need to be invalidated.
2507 //
2508 // GFX12.5 has a shared WGP$, so no invalidates are required.
2509 if (ST.isCuModeEnabled())
2510 return false;
2511
2512 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2513 break;
2514 case SIAtomicScope::WAVEFRONT:
2515 case SIAtomicScope::SINGLETHREAD:
2516 // No cache to invalidate.
2517 return false;
2518 default:
2519 llvm_unreachable("Unsupported synchronization scope");
2520 }
2521
2522 if (Pos == Position::AFTER)
2523 ++MI;
2524
2525 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2526
2527 if (Pos == Position::AFTER)
2528 --MI;
2529
2530 return true;
2531}
2532
2533bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2534 SIAtomicScope Scope,
2535 SIAtomicAddrSpace AddrSpace,
2536 bool IsCrossAddrSpaceOrdering,
2537 Position Pos) const {
2538 bool Changed = false;
2539
2540 MachineBasicBlock &MBB = *MI->getParent();
2541 DebugLoc DL = MI->getDebugLoc();
2542
2543 // The scratch address space does not need the global memory cache
2544 // writeback as all memory operations by the same thread are
2545 // sequentially consistent, and no other thread can access scratch
2546 // memory.
2547 if (canAffectGlobalAddrSpace(AddrSpace)) {
2548 if (Pos == Position::AFTER)
2549 ++MI;
2550
2551 // A global_wb is only necessary at system scope for GFX12.0. It is
2552 // also necessary at device scope for GFX12.5, as stores
2553 // cannot report completion earlier than L2.
2554 //
2555 // Emitting it for lower scopes is a slow no-op, so we omit it
2556 // for performance.
2557 switch (Scope) {
2558 case SIAtomicScope::SYSTEM:
2559 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2560 .addImm(AMDGPU::CPol::SCOPE_SYS);
2561 Changed = true;
2562 break;
2563 case SIAtomicScope::AGENT:
2564 // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
2565 if (ST.hasGFX1250Insts()) {
2566 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2567 .addImm(AMDGPU::CPol::SCOPE_DEV);
2568 Changed = true;
2569 }
2570 break;
2571 case SIAtomicScope::CLUSTER:
2572 case SIAtomicScope::WORKGROUP:
2573 // No WB necessary, but we still have to wait.
2574 case SIAtomicScope::WAVEFRONT:
2575 case SIAtomicScope::SINGLETHREAD:
2576 // No WB or wait necessary here, but insertWait takes care of that.
2577 break;
2578 default:
2579 llvm_unreachable("Unsupported synchronization scope");
2580 }
2581
2582 if (Pos == Position::AFTER)
2583 --MI;
2584 }
2585
2586 // We always have to wait for previous memory operations (load/store) to
2587 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2588 // we of course need to wait for that as well.
2589 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2590 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
2591
2592 return Changed;
2593}
2594
2595bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2596 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2597 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2598
2599 // Only handle load and store, not atomic read-modify-write instructions.
2600 assert(MI->mayLoad() ^ MI->mayStore());
2601
2602 // Only update load and store, not LLVM IR atomic read-modify-write
2603 // instructions. The latter are always marked as volatile, so they cannot be
2604 // handled here without pessimizing all atomics. They also do not support
2605 // the nontemporal attribute.
2606 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2607
2608 bool Changed = false;
2609
2610 if (IsLastUse) {
2611 // Set last-use hint.
2612 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2613 } else if (IsNonTemporal) {
2614 // Set non-temporal hint for all cache levels.
2615 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2616 }
2617
2618 if (IsVolatile) {
2619 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2620
2621 // Ensure operation has completed at system scope to cause all volatile
2622 // operations to be visible outside the program in a global order. Do not
2623 // request cross address space as only the global address space can be
2624 // observable outside the program, so no need to cause a waitcnt for LDS
2625 // address space operations.
2626 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2627 Position::AFTER, AtomicOrdering::Unordered);
2628 }
2629
2630 return Changed;
2631}
2632
2633bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2634 assert(MI.mayStore() && "Not a Store inst");
2635 const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2636 bool Changed = false;
2637
2638 // GFX12.5 only: an xcnt wait is needed before flat and global atomic
2639 // stores/rmw.
2640 if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
2641 MachineBasicBlock &MBB = *MI.getParent();
2642 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2643 Changed = true;
2644 }
2645
2646 // Remaining fixes do not apply to RMWs.
2647 if (IsRMW)
2648 return Changed;
2649
2650 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2651 if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2652 return Changed;
2653 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2654
2655 // GFX12.0 only: Extra waits needed before system scope stores.
2656 if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
2657 Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
2658
2659 return Changed;
2660}
2661
2662bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2663 if (!ST.hasGFX1250Insts())
2664 return false;
2665
2666 // Cooperative atomics need to be SCOPE_DEV or higher.
2667 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2668 assert(CPol && "No CPol operand?");
2669 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2670 if (Scope < CPol::SCOPE_DEV)
2671 return setScope(MI, CPol::SCOPE_DEV);
2672 return false;
2673}
2674
2675bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2676 SIAtomicScope Scope,
2677 SIAtomicAddrSpace AddrSpace) const {
2678 bool Changed = false;
2679
2680 if (canAffectGlobalAddrSpace(AddrSpace)) {
2681 switch (Scope) {
2682 case SIAtomicScope::SYSTEM:
2683 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2684 break;
2685 case SIAtomicScope::AGENT:
2686 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2687 break;
2688 case SIAtomicScope::CLUSTER:
2689 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2690 break;
2691 case SIAtomicScope::WORKGROUP:
2692 // In workgroup mode, SCOPE_SE is needed as waves can execute on
2693 // different CUs that access different L0s.
2694 if (!ST.isCuModeEnabled())
2695 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2696 break;
2697 case SIAtomicScope::WAVEFRONT:
2698 case SIAtomicScope::SINGLETHREAD:
2699 // No cache to bypass.
2700 break;
2701 default:
2702 llvm_unreachable("Unsupported synchronization scope");
2703 }
2704 }
2705
2706 // The scratch address space does not need the global memory caches
2707 // to be bypassed as all memory operations by the same thread are
2708 // sequentially consistent, and no other thread can access scratch
2709 // memory.
2710
2711 // Other address spaces do not have a cache.
2712
2713 return Changed;
2714}
2715
2716bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2717 if (AtomicPseudoMIs.empty())
2718 return false;
2719
2720 for (auto &MI : AtomicPseudoMIs)
2721 MI->eraseFromParent();
2722
2723 AtomicPseudoMIs.clear();
2724 return true;
2725}
2726
2727bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2728 MachineBasicBlock::iterator &MI) {
2729 assert(MI->mayLoad() && !MI->mayStore());
2730
2731 bool Changed = false;
2732
2733 if (MOI.isAtomic()) {
2734 const AtomicOrdering Order = MOI.getOrdering();
2735 if (Order == AtomicOrdering::Monotonic ||
2736 Order == AtomicOrdering::Acquire ||
2737 Order == AtomicOrdering::SequentiallyConsistent) {
2738 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2739 MOI.getOrderingAddrSpace());
2740 }
2741
2742 // Handle cooperative atomics after cache bypass step, as it may override
2743 // the scope of the instruction to a greater scope.
2744 if (MOI.isCooperative())
2745 Changed |= CC->handleCooperativeAtomic(*MI);
2746
2747 if (Order == AtomicOrdering::SequentiallyConsistent)
2748 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2749 SIMemOp::LOAD | SIMemOp::STORE,
2750 MOI.getIsCrossAddressSpaceOrdering(),
2751 Position::BEFORE, Order);
2752
2753 if (Order == AtomicOrdering::Acquire ||
2754 Order == AtomicOrdering::SequentiallyConsistent) {
2755 Changed |= CC->insertWait(
2756 MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
2757 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2758 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2759 MOI.getOrderingAddrSpace(),
2760 Position::AFTER);
2761 }
2762
2763 return Changed;
2764 }
2765
2766 // Atomic instructions already bypass caches to the scope specified by the
2767 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2768 // instructions need additional treatment.
2769 Changed |= CC->enableVolatileAndOrNonTemporal(
2770 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2771 MOI.isNonTemporal(), MOI.isLastUse());
2772
2773 return Changed;
2774}
2775
2776bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2777 MachineBasicBlock::iterator &MI) {
2778 assert(!MI->mayLoad() && MI->mayStore());
2779
2780 bool Changed = false;
2781 // FIXME: Necessary hack because iterator can lose track of the store.
2782 MachineInstr &StoreMI = *MI;
2783
2784 if (MOI.isAtomic()) {
2785 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2786 MOI.getOrdering() == AtomicOrdering::Release ||
2787 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2788 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2789 MOI.getOrderingAddrSpace());
2790 }
2791
2792 // Handle cooperative atomics after cache bypass step, as it may override
2793 // the scope of the instruction to a greater scope.
2794 if (MOI.isCooperative())
2795 Changed |= CC->handleCooperativeAtomic(*MI);
2796
2797 if (MOI.getOrdering() == AtomicOrdering::Release ||
2798 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2799 Changed |= CC->insertRelease(MI, MOI.getScope(),
2800 MOI.getOrderingAddrSpace(),
2801 MOI.getIsCrossAddressSpaceOrdering(),
2802 Position::BEFORE);
2803
2804 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
2805 return Changed;
2806 }
2807
2808 // Atomic instructions already bypass caches to the scope specified by the
2809 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2810 // need additional treatment.
2811 Changed |= CC->enableVolatileAndOrNonTemporal(
2812 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2813 MOI.isNonTemporal());
2814
2815 // GFX12 specific: the scope (desired coherence domain in the cache
2816 // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2817 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
2818 return Changed;
2819}
2820
2821bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2822 MachineBasicBlock::iterator &MI) {
2823 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2824
2825 AtomicPseudoMIs.push_back(MI);
2826 bool Changed = false;
2827
2828 const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2829
2830 if (MOI.isAtomic()) {
2831 const AtomicOrdering Order = MOI.getOrdering();
2832 if (Order == AtomicOrdering::Acquire) {
2833 Changed |= CC->insertWait(
2834 MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2835 MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
2836 }
2837
2838 if (Order == AtomicOrdering::Release ||
2839 Order == AtomicOrdering::AcquireRelease ||
2840 Order == AtomicOrdering::SequentiallyConsistent)
2841 /// TODO: This relies on a barrier always generating a waitcnt
2842 /// for LDS to ensure it is not reordered with the completion of
2843 /// the preceding LDS operations. If the barrier had a memory
2844 /// ordering and memory scope, then the library would not need to
2845 /// generate a fence. Could add support in this file for
2846 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2847 /// adding S_WAITCNT before a S_BARRIER.
2848 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2849 MOI.getIsCrossAddressSpaceOrdering(),
2850 Position::BEFORE);
2851
2852 // TODO: If both release and invalidate are happening they could be combined
2853 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2854 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2855 // track cache invalidate and write back instructions.
2856
2857 if (Order == AtomicOrdering::Acquire ||
2858 Order == AtomicOrdering::AcquireRelease ||
2859 Order == AtomicOrdering::SequentiallyConsistent)
2860 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2861 Position::BEFORE);
2862
2863 return Changed;
2864 }
2865
2866 return Changed;
2867}
2868
2869bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2870 MachineBasicBlock::iterator &MI) {
2871 assert(MI->mayLoad() && MI->mayStore());
2872
2873 bool Changed = false;
2874 MachineInstr &RMWMI = *MI;
2875
2876 if (MOI.isAtomic()) {
2877 const AtomicOrdering Order = MOI.getOrdering();
2878 if (Order == AtomicOrdering::Monotonic ||
2879 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2880 Order == AtomicOrdering::AcquireRelease ||
2881 Order == AtomicOrdering::SequentiallyConsistent) {
2882 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2883 MOI.getInstrAddrSpace());
2884 }
2885
2886 if (Order == AtomicOrdering::Release ||
2887 Order == AtomicOrdering::AcquireRelease ||
2888 Order == AtomicOrdering::SequentiallyConsistent ||
2889 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2890 Changed |= CC->insertRelease(MI, MOI.getScope(),
2891 MOI.getOrderingAddrSpace(),
2892 MOI.getIsCrossAddressSpaceOrdering(),
2893 Position::BEFORE);
2894
2895 if (Order == AtomicOrdering::Acquire ||
2896 Order == AtomicOrdering::AcquireRelease ||
2897 Order == AtomicOrdering::SequentiallyConsistent ||
2898 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2899 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2900 Changed |= CC->insertWait(
2901 MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2902 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2903 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2904 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2905 MOI.getOrderingAddrSpace(),
2906 Position::AFTER);
2907 }
2908
2909 Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
2910 return Changed;
2911 }
2912
2913 return Changed;
2914}
2915
2916bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2917 const MachineModuleInfo &MMI =
2918 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2919 return SIMemoryLegalizer(MMI).run(MF);
2920}
2921
2922 PreservedAnalyses
2923 SIMemoryLegalizerPass::run(MachineFunction &MF,
2924 MachineFunctionAnalysisManager &MFAM) {
2925 auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
2926 .getCachedResult<MachineModuleAnalysis>(
2927 *MF.getFunction().getParent());
2928 assert(MMI && "MachineModuleAnalysis must be available");
2929 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2930 return PreservedAnalyses::all();
2932}
2933
2934bool SIMemoryLegalizer::run(MachineFunction &MF) {
2935 bool Changed = false;
2936
2937 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2938 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
2939 CC = SICacheControl::create(ST);
2940
2941 for (auto &MBB : MF) {
2942 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2943
2944 // Unbundle instructions after the post-RA scheduler.
2945 if (MI->isBundle() && MI->mayLoadOrStore()) {
2946 MachineBasicBlock::instr_iterator II(MI->getIterator());
2947 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2948 I != E && I->isBundledWithPred(); ++I) {
2949 I->unbundleFromPred();
2950 for (MachineOperand &MO : I->operands())
2951 if (MO.isReg())
2952 MO.setIsInternalRead(false);
2953 }
2954
2955 MI->eraseFromParent();
2956 MI = II->getIterator();
2957 }
2958
2959 if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
2960 Changed |= CC->insertBarrierStart(MI);
2961 continue;
2962 }
2963
2964 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2965 continue;
2966
2967 if (const auto &MOI = MOA.getLoadInfo(MI))
2968 Changed |= expandLoad(*MOI, MI);
2969 else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2970 Changed |= expandStore(*MOI, MI);
2971 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2972 Changed |= expandAtomicFence(*MOI, MI);
2973 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2974 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2975 }
2976 }
2977
2978 Changed |= removeAtomicPseudoMIs();
2979 return Changed;
2980}
2981
2982INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
2983
2984char SIMemoryLegalizerLegacy::ID = 0;
2985char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2986
2987 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2988 return new SIMemoryLegalizerLegacy();
2989}