SIMemoryLegalizer.cpp
1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
18#include "GCNSubtarget.h"
27#include "llvm/IR/PassManager.h"
30
31using namespace llvm;
32using namespace llvm::AMDGPU;
33
34#define DEBUG_TYPE "si-memory-legalizer"
35#define PASS_NAME "SI Memory Legalizer"
36
38 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
39 cl::desc("Use this to skip inserting cache invalidating instructions."));
40
41namespace {
42
43LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
44
45/// Memory operation flags. Can be ORed together.
46enum class SIMemOp {
47 NONE = 0u,
48 LOAD = 1u << 0,
49 STORE = 1u << 1,
50 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
51};
52
53/// Position to insert a new instruction relative to an existing
54/// instruction.
55enum class Position {
56 BEFORE,
57 AFTER
58};
59
60/// The atomic synchronization scopes supported by the AMDGPU target.
61enum class SIAtomicScope {
62 NONE,
63 SINGLETHREAD,
64 WAVEFRONT,
65 WORKGROUP,
66 CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
67 AGENT,
68 SYSTEM
69};
70
71/// The distinct address spaces supported by the AMDGPU target for
72/// atomic memory operations. Can be ORed together.
73enum class SIAtomicAddrSpace {
74 NONE = 0u,
75 GLOBAL = 1u << 0,
76 LDS = 1u << 1,
77 SCRATCH = 1u << 2,
78 GDS = 1u << 3,
79 OTHER = 1u << 4,
80
81 /// The address spaces that can be accessed by a FLAT instruction.
82 FLAT = GLOBAL | LDS | SCRATCH,
83
84 /// The address spaces that support atomic instructions.
85 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
86
87 /// All address spaces.
88 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
89
90 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
91};
92
93class SIMemOpInfo final {
94private:
95
96 friend class SIMemOpAccess;
97
98 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
99 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
100 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
101 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
102 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
103 bool IsCrossAddressSpaceOrdering = false;
104 bool IsVolatile = false;
105 bool IsNonTemporal = false;
106 bool IsLastUse = false;
107 bool IsCooperative = false;
108
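  /// Constructs a SIMemOpInfo and normalizes it: drops the cross-address-space
  /// flag when only a single address space is both accessed and ordered, limits
  /// the scope to what the instruction's address spaces can require, and
  /// promotes CLUSTER to AGENT on subtargets without workgroup clusters.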
109 SIMemOpInfo(
110 const GCNSubtarget &ST,
111 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
112 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
113 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
114 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
115 bool IsCrossAddressSpaceOrdering = true,
116 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
117 bool IsVolatile = false, bool IsNonTemporal = false,
118 bool IsLastUse = false, bool IsCooperative = false)
119 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
120 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
121 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
122 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
123 IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
124
125 if (Ordering == AtomicOrdering::NotAtomic) {
126 assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
127 assert(Scope == SIAtomicScope::NONE &&
128 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
129 !IsCrossAddressSpaceOrdering &&
130 FailureOrdering == AtomicOrdering::NotAtomic);
131 return;
132 }
133
134 assert(Scope != SIAtomicScope::NONE &&
135 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
136 SIAtomicAddrSpace::NONE &&
137 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
138 SIAtomicAddrSpace::NONE);
139
140 // There is also no cross address space ordering if the ordering
141 // address space is the same as the instruction address space and
142 // only contains a single address space.
143 if ((OrderingAddrSpace == InstrAddrSpace) &&
144 isPowerOf2_32(uint32_t(InstrAddrSpace)))
145 this->IsCrossAddressSpaceOrdering = false;
146
147 // Limit the scope to the maximum supported by the instruction's address
148 // spaces.
149 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
150 SIAtomicAddrSpace::NONE) {
151 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
152 } else if ((InstrAddrSpace &
153 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
154 SIAtomicAddrSpace::NONE) {
155 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
156 } else if ((InstrAddrSpace &
157 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
158 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
159 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
160 }
161
162 // On targets that have no concept of a workgroup cluster, use
163 // AGENT scope as a conservatively correct alternative.
164 if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
165 this->Scope = SIAtomicScope::AGENT;
166 }
167
168public:
169 /// \returns Atomic synchronization scope of the machine instruction used to
170 /// create this SIMemOpInfo.
171 SIAtomicScope getScope() const {
172 return Scope;
173 }
174
175 /// \returns Ordering constraint of the machine instruction used to
176 /// create this SIMemOpInfo.
177 AtomicOrdering getOrdering() const {
178 return Ordering;
179 }
180
181 /// \returns Failure ordering constraint of the machine instruction used to
182 /// create this SIMemOpInfo.
183 AtomicOrdering getFailureOrdering() const {
184 return FailureOrdering;
185 }
186
187 /// \returns The address spaces accessed by the machine
188 /// instruction used to create this SIMemOpInfo.
189 SIAtomicAddrSpace getInstrAddrSpace() const {
190 return InstrAddrSpace;
191 }
192
193 /// \returns The address spaces that must be ordered by the machine
194 /// instruction used to create this SIMemOpInfo.
195 SIAtomicAddrSpace getOrderingAddrSpace() const {
196 return OrderingAddrSpace;
197 }
198
199 /// \returns True iff memory ordering of operations on
200 /// different address spaces is required.
201 bool getIsCrossAddressSpaceOrdering() const {
202 return IsCrossAddressSpaceOrdering;
203 }
204
205 /// \returns True if memory access of the machine instruction used to
206 /// create this SIMemOpInfo is volatile, false otherwise.
207 bool isVolatile() const {
208 return IsVolatile;
209 }
210
211 /// \returns True if memory access of the machine instruction used to
212 /// create this SIMemOpInfo is nontemporal, false otherwise.
213 bool isNonTemporal() const {
214 return IsNonTemporal;
215 }
216
217 /// \returns True if memory access of the machine instruction used to
218 /// create this SIMemOpInfo is last use, false otherwise.
219 bool isLastUse() const { return IsLastUse; }
220
221 /// \returns True if this is a cooperative load or store atomic.
222 bool isCooperative() const { return IsCooperative; }
223
224 /// \returns True if ordering constraint of the machine instruction used to
225 /// create this SIMemOpInfo is unordered or higher, false otherwise.
226 bool isAtomic() const {
227 return Ordering != AtomicOrdering::NotAtomic;
228 }
229
230};
231
232class SIMemOpAccess final {
233private:
234 const AMDGPUMachineModuleInfo *MMI = nullptr;
235 const GCNSubtarget &ST;
236
237 /// Reports unsupported message \p Msg for \p MI to LLVM context.
238 void reportUnsupported(const MachineBasicBlock::iterator &MI,
239 const char *Msg) const;
240
241 /// Inspects the target synchronization scope \p SSID and determines
242 /// the SI atomic scope it corresponds to, the address spaces it
243 /// covers, and whether the memory ordering applies between address
244 /// spaces.
245 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
246 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
247
248 /// \returns A bit set of the address spaces corresponding to \p AS.
249 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
250
251 /// \returns Info constructed from \p MI, which has at least one machine memory
252 /// operand.
253 std::optional<SIMemOpInfo>
254 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
255
256public:
257 /// Construct class to support accessing the machine memory operands
258 /// of instructions in the machine function \p MF.
259 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
260
261 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
262 std::optional<SIMemOpInfo>
263 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
264
265 /// \returns Store info if \p MI is a store operation, "std::nullopt"
266 /// otherwise.
267 std::optional<SIMemOpInfo>
268 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
269
270 /// \returns Atomic fence info if \p MI is an atomic fence operation,
271 /// "std::nullopt" otherwise.
272 std::optional<SIMemOpInfo>
273 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
274
275 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
276 /// rmw operation, "std::nullopt" otherwise.
277 std::optional<SIMemOpInfo>
278 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
279};
280
281class SICacheControl {
282protected:
283
284 /// AMDGPU subtarget info.
285 const GCNSubtarget &ST;
286
287 /// Instruction info.
288 const SIInstrInfo *TII = nullptr;
289
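  /// ISA version of the subtarget; used to encode S_WAITCNT operand fields.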
290 IsaVersion IV;
291
292 /// Whether to insert cache invalidating instructions.
293 bool InsertCacheInv;
294
295 SICacheControl(const GCNSubtarget &ST);
296
297 /// Sets named bit \p Bit to "true" if present in instruction \p MI.
298 /// \returns Returns true if \p MI is modified, false otherwise.
299 bool enableNamedBit(const MachineBasicBlock::iterator MI,
300 AMDGPU::CPol::CPol Bit) const;
301
302public:
303
304 /// Create a cache control for the subtarget \p ST.
305 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
306
307 /// Update \p MI memory load instruction to bypass any caches up to
308 /// the \p Scope memory scope for address spaces \p
309 /// AddrSpace. Return true iff the instruction was modified.
310 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
311 SIAtomicScope Scope,
312 SIAtomicAddrSpace AddrSpace) const = 0;
313
314 /// Update \p MI memory store instruction to bypass any caches up to
315 /// the \p Scope memory scope for address spaces \p
316 /// AddrSpace. Return true iff the instruction was modified.
317 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
318 SIAtomicScope Scope,
319 SIAtomicAddrSpace AddrSpace) const = 0;
320
321 /// Update \p MI memory read-modify-write instruction to bypass any caches up
322 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
323 /// iff the instruction was modified.
324 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
325 SIAtomicScope Scope,
326 SIAtomicAddrSpace AddrSpace) const = 0;
327
328 /// Update \p MI memory instruction of kind \p Op associated with address
329 /// spaces \p AddrSpace to indicate it is volatile and/or
330 /// nontemporal/last-use. Return true iff the instruction was modified.
331 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
332 SIAtomicAddrSpace AddrSpace,
333 SIMemOp Op, bool IsVolatile,
334 bool IsNonTemporal,
335 bool IsLastUse = false) const = 0;
336
337 virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
338 return false;
339 };
340
341 /// Handle cooperative load/store atomics.
342 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
344 "cooperative atomics are not available on this architecture");
345 }
346
347 /// Inserts any necessary instructions at position \p Pos relative
348 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
349 /// \p Op associated with address spaces \p AddrSpace have completed. Used
350 /// between memory instructions to enforce the order they become visible as
351 /// observed by other memory instructions executing in memory scope \p Scope.
352 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
353 /// address spaces. Returns true iff any instructions inserted.
354 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
355 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
356 bool IsCrossAddrSpaceOrdering, Position Pos,
357 AtomicOrdering Order) const = 0;
358
359 /// Inserts any necessary instructions at position \p Pos relative to
360 /// instruction \p MI to ensure any subsequent memory instructions of this
361 /// thread with address spaces \p AddrSpace will observe the previous memory
362 /// operations by any thread for memory scopes up to memory scope \p Scope.
363 /// Returns true iff any instructions inserted.
364 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
365 SIAtomicScope Scope,
366 SIAtomicAddrSpace AddrSpace,
367 Position Pos) const = 0;
368
369 /// Inserts any necessary instructions at position \p Pos relative to
370 /// instruction \p MI to ensure previous memory instructions by this thread
371 /// with address spaces \p AddrSpace have completed and can be observed by
372 /// subsequent memory instructions by any thread executing in memory scope \p
373 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
374 /// between address spaces. Returns true iff any instructions inserted.
375 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
376 SIAtomicScope Scope,
377 SIAtomicAddrSpace AddrSpace,
378 bool IsCrossAddrSpaceOrdering,
379 Position Pos) const = 0;
380
381 /// Inserts any necessary instructions before the barrier start instruction
382 /// \p MI in order to support pairing of barriers and fences.
383 virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
384 return false;
385 };
386
387 /// Virtual destructor to allow derivations to be deleted.
388 virtual ~SICacheControl() = default;
389};
390
391class SIGfx6CacheControl : public SICacheControl {
392protected:
393
394 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
395 /// is modified, false otherwise.
396 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
397 return enableNamedBit(MI, AMDGPU::CPol::GLC);
398 }
399
400 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
401 /// is modified, false otherwise.
402 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
403 return enableNamedBit(MI, AMDGPU::CPol::SLC);
404 }
405
406public:
407
408 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
409
410 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
411 SIAtomicScope Scope,
412 SIAtomicAddrSpace AddrSpace) const override;
413
414 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
415 SIAtomicScope Scope,
416 SIAtomicAddrSpace AddrSpace) const override;
417
418 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
419 SIAtomicScope Scope,
420 SIAtomicAddrSpace AddrSpace) const override;
421
422 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
423 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
424 bool IsVolatile, bool IsNonTemporal,
425 bool IsLastUse) const override;
426
427 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
428 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
429 bool IsCrossAddrSpaceOrdering, Position Pos,
430 AtomicOrdering Order) const override;
431
432 bool insertAcquire(MachineBasicBlock::iterator &MI,
433 SIAtomicScope Scope,
434 SIAtomicAddrSpace AddrSpace,
435 Position Pos) const override;
436
437 bool insertRelease(MachineBasicBlock::iterator &MI,
438 SIAtomicScope Scope,
439 SIAtomicAddrSpace AddrSpace,
440 bool IsCrossAddrSpaceOrdering,
441 Position Pos) const override;
442};
443
444class SIGfx7CacheControl : public SIGfx6CacheControl {
445public:
446
447 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
448
449 bool insertAcquire(MachineBasicBlock::iterator &MI,
450 SIAtomicScope Scope,
451 SIAtomicAddrSpace AddrSpace,
452 Position Pos) const override;
453
454};
455
456class SIGfx90ACacheControl : public SIGfx7CacheControl {
457public:
458
459 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
460
461 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
462 SIAtomicScope Scope,
463 SIAtomicAddrSpace AddrSpace) const override;
464
465 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
466 SIAtomicScope Scope,
467 SIAtomicAddrSpace AddrSpace) const override;
468
469 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
470 SIAtomicScope Scope,
471 SIAtomicAddrSpace AddrSpace) const override;
472
473 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
474 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
475 bool IsVolatile, bool IsNonTemporal,
476 bool IsLastUse) const override;
477
478 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
479 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
480 bool IsCrossAddrSpaceOrdering, Position Pos,
481 AtomicOrdering Order) const override;
482
483 bool insertAcquire(MachineBasicBlock::iterator &MI,
484 SIAtomicScope Scope,
485 SIAtomicAddrSpace AddrSpace,
486 Position Pos) const override;
487
488 bool insertRelease(MachineBasicBlock::iterator &MI,
489 SIAtomicScope Scope,
490 SIAtomicAddrSpace AddrSpace,
491 bool IsCrossAddrSpaceOrdering,
492 Position Pos) const override;
493};
494
495class SIGfx940CacheControl : public SIGfx90ACacheControl {
496protected:
497
498 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
499 /// is modified, false otherwise.
500 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
501 return enableNamedBit(MI, AMDGPU::CPol::SC0);
502 }
503
504 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
505 /// is modified, false otherwise.
506 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
507 return enableNamedBit(MI, AMDGPU::CPol::SC1);
508 }
509
510 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
511 /// is modified, false otherwise.
512 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
513 return enableNamedBit(MI, AMDGPU::CPol::NT);
514 }
515
516public:
517 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
518
519 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
520 SIAtomicScope Scope,
521 SIAtomicAddrSpace AddrSpace) const override;
522
523 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
524 SIAtomicScope Scope,
525 SIAtomicAddrSpace AddrSpace) const override;
526
527 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
528 SIAtomicScope Scope,
529 SIAtomicAddrSpace AddrSpace) const override;
530
531 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
532 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
533 bool IsVolatile, bool IsNonTemporal,
534 bool IsLastUse) const override;
535
536 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
537 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
538
539 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
540 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
541 Position Pos) const override;
542};
543
544class SIGfx10CacheControl : public SIGfx7CacheControl {
545protected:
546
547 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
548 /// is modified, false otherwise.
549 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
550 return enableNamedBit(MI, AMDGPU::CPol::DLC);
551 }
552
553public:
554
555 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
556
557 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
558 SIAtomicScope Scope,
559 SIAtomicAddrSpace AddrSpace) const override;
560
561 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
562 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
563 bool IsVolatile, bool IsNonTemporal,
564 bool IsLastUse) const override;
565
566 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
567 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
568 bool IsCrossAddrSpaceOrdering, Position Pos,
569 AtomicOrdering Order) const override;
570
571 bool insertAcquire(MachineBasicBlock::iterator &MI,
572 SIAtomicScope Scope,
573 SIAtomicAddrSpace AddrSpace,
574 Position Pos) const override;
575
576 bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
577};
578
579class SIGfx11CacheControl : public SIGfx10CacheControl {
580public:
581 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
582
583 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
584 SIAtomicScope Scope,
585 SIAtomicAddrSpace AddrSpace) const override;
586
587 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
588 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
589 bool IsVolatile, bool IsNonTemporal,
590 bool IsLastUse) const override;
591};
592
593class SIGfx12CacheControl : public SIGfx11CacheControl {
594protected:
595 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
596 // \returns Returns true if \p MI is modified, false otherwise.
597 bool setTH(const MachineBasicBlock::iterator MI,
598 AMDGPU::CPol::CPol Value) const;
599 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
600 // MI. \returns Returns true if \p MI is modified, false otherwise.
601 bool setScope(const MachineBasicBlock::iterator MI,
602 AMDGPU::CPol::CPol Value) const;
603
604 // Stores with system scope (SCOPE_SYS) need to wait for:
605 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
606 // - non-returning-atomics - wait for STORECNT==0
607 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
608 // since it does not distinguish atomics-with-return from regular stores.
609 // There is no need to wait if memory is cached (mtype != UC).
610 bool
611 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
612
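  // Sets the cache policy scope on \p MI's CPol operand to match \p Scope for
  // the address spaces in \p AddrSpace. Used by the cache-bypass hooks below.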
613 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
614 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
615
616public:
617 SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
618 // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
619 // the behavior is the same if assuming GFX12.0 in CU mode.
620 assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
621 }
622
623 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
624 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
625 bool IsCrossAddrSpaceOrdering, Position Pos,
626 AtomicOrdering Order) const override;
627
628 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
629 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
630
631 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
632 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
633 bool IsVolatile, bool IsNonTemporal,
634 bool IsLastUse) const override;
635
636 bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
637
638 virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
639
640 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
641 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
642 Position Pos) const override;
643
644 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
645 SIAtomicScope Scope,
646 SIAtomicAddrSpace AddrSpace) const override {
647 return setAtomicScope(MI, Scope, AddrSpace);
648 }
649
650 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
651 SIAtomicScope Scope,
652 SIAtomicAddrSpace AddrSpace) const override {
653 return setAtomicScope(MI, Scope, AddrSpace);
654 }
655
656 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
657 SIAtomicScope Scope,
658 SIAtomicAddrSpace AddrSpace) const override {
659 return setAtomicScope(MI, Scope, AddrSpace);
660 }
661};
662
663class SIMemoryLegalizer final {
664private:
665 const MachineModuleInfo &MMI;
666 /// Cache Control.
667 std::unique_ptr<SICacheControl> CC = nullptr;
668
669 /// List of atomic pseudo instructions.
670 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
671
672 /// Return true iff instruction \p MI is an atomic instruction that
673 /// returns a result.
674 bool isAtomicRet(const MachineInstr &MI) const {
675 return SIInstrInfo::isAtomicRet(MI);
676 }
677
678 /// Removes all processed atomic pseudo instructions from the current
679 /// function. Returns true if current function is modified, false otherwise.
680 bool removeAtomicPseudoMIs();
681
682 /// Expands load operation \p MI. Returns true if instructions are
683 /// added/deleted or \p MI is modified, false otherwise.
684 bool expandLoad(const SIMemOpInfo &MOI,
685 MachineBasicBlock::iterator &MI);
686 /// Expands store operation \p MI. Returns true if instructions are
687 /// added/deleted or \p MI is modified, false otherwise.
688 bool expandStore(const SIMemOpInfo &MOI,
689 MachineBasicBlock::iterator &MI);
690 /// Expands atomic fence operation \p MI. Returns true if
691 /// instructions are added/deleted or \p MI is modified, false otherwise.
692 bool expandAtomicFence(const SIMemOpInfo &MOI,
693 MachineBasicBlock::iterator &MI);
694 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
695 /// instructions are added/deleted or \p MI is modified, false otherwise.
696 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
697 MachineBasicBlock::iterator &MI);
698
699public:
700 SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
701 bool run(MachineFunction &MF);
702};
703
704class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
705public:
706 static char ID;
707
708 SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
709
710 void getAnalysisUsage(AnalysisUsage &AU) const override {
711 AU.setPreservesCFG();
712 MachineFunctionPass::getAnalysisUsage(AU);
713 }
714
715 StringRef getPassName() const override {
716 return PASS_NAME;
717 }
718
719 bool runOnMachineFunction(MachineFunction &MF) override;
720};
721
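/// Address space names accepted in "amdgpu-synchronize-as" MMRA annotations,
/// mapped to the SI atomic address spaces they denote.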
722static const StringMap<SIAtomicAddrSpace> ASNames = {{
723 {"global", SIAtomicAddrSpace::GLOBAL},
724 {"local", SIAtomicAddrSpace::LDS},
725}};
726
727void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
728 const MachineFunction *MF = MI.getMF();
729 const Function &Fn = MF->getFunction();
730 SmallString<128> Str;
731 raw_svector_ostream OS(Str);
732 OS << "unknown address space '" << AS << "'; expected one of ";
733 ListSeparator LS;
734 for (const auto &[Name, Val] : ASNames)
735 OS << LS << '\'' << Name << '\'';
736 Fn.getContext().diagnose(
737 DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
738}
739
740/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
741/// If this tag isn't present, or if it has no meaningful values, returns
742/// std::nullopt; otherwise returns the address spaces specified by the metadata.
743static std::optional<SIAtomicAddrSpace>
744getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
745 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
746
747 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
748 if (!MMRA)
749 return std::nullopt;
750
751 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
752 for (const auto &[Prefix, Suffix] : MMRA) {
753 if (Prefix != FenceASPrefix)
754 continue;
755
756 if (auto It = ASNames.find(Suffix); It != ASNames.end())
757 Result |= It->second;
758 else
759 diagnoseUnknownMMRAASName(MI, Suffix);
760 }
761
762 if (Result == SIAtomicAddrSpace::NONE)
763 return std::nullopt;
764
765 return Result;
766}
767
768} // end anonymous namespace
769
770void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
771 const char *Msg) const {
772 const Function &Func = MI->getParent()->getParent()->getFunction();
773 Func.getContext().diagnose(
774 DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
775}
776
777std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
778SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
779 SIAtomicAddrSpace InstrAddrSpace) const {
780 if (SSID == SyncScope::System)
781 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
782 if (SSID == MMI->getAgentSSID())
783 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
784 if (SSID == MMI->getClusterSSID())
785 return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
786 if (SSID == MMI->getWorkgroupSSID())
787 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
788 true);
789 if (SSID == MMI->getWavefrontSSID())
790 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
791 true);
792 if (SSID == SyncScope::SingleThread)
793 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
794 true);
795 if (SSID == MMI->getSystemOneAddressSpaceSSID())
796 return std::tuple(SIAtomicScope::SYSTEM,
797 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
798 if (SSID == MMI->getAgentOneAddressSpaceSSID())
799 return std::tuple(SIAtomicScope::AGENT,
800 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
801 if (SSID == MMI->getClusterOneAddressSpaceSSID())
802 return std::tuple(SIAtomicScope::CLUSTER,
803 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
804 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
805 return std::tuple(SIAtomicScope::WORKGROUP,
806 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
807 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
808 return std::tuple(SIAtomicScope::WAVEFRONT,
809 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
810 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
811 return std::tuple(SIAtomicScope::SINGLETHREAD,
812 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
813 return std::nullopt;
814}
815
816SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
817 if (AS == AMDGPUAS::FLAT_ADDRESS)
818 return SIAtomicAddrSpace::FLAT;
819 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
820 return SIAtomicAddrSpace::GLOBAL;
821 if (AS == AMDGPUAS::LOCAL_ADDRESS)
822 return SIAtomicAddrSpace::LDS;
823 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
824 return SIAtomicAddrSpace::SCRATCH;
825 if (AS == AMDGPUAS::REGION_ADDRESS)
826 return SIAtomicAddrSpace::GDS;
827
828 return SIAtomicAddrSpace::OTHER;
829}
830
831SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
832 const GCNSubtarget &ST)
833 : MMI(&MMI_), ST(ST) {}
834
835std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
836 const MachineBasicBlock::iterator &MI) const {
837 assert(MI->getNumMemOperands() > 0);
838
839 SyncScope::ID SSID = SyncScope::SingleThread;
840 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
841 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
842 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
843 bool IsNonTemporal = true;
844 bool IsVolatile = false;
845 bool IsLastUse = false;
846 bool IsCooperative = false;
847
848 // Validator should check whether or not MMOs cover the entire set of
849 // locations accessed by the memory instruction.
850 for (const auto &MMO : MI->memoperands()) {
851 IsNonTemporal &= MMO->isNonTemporal();
852 IsVolatile |= MMO->isVolatile();
853 IsLastUse |= MMO->getFlags() & MOLastUse;
854 IsCooperative |= MMO->getFlags() & MOCooperative;
855 InstrAddrSpace |=
856 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
857 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
858 if (OpOrdering != AtomicOrdering::NotAtomic) {
859 const auto &IsSyncScopeInclusion =
860 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
861 if (!IsSyncScopeInclusion) {
862 reportUnsupported(MI,
863 "Unsupported non-inclusive atomic synchronization scope");
864 return std::nullopt;
865 }
866
867 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
868 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
869 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
870 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
871 FailureOrdering =
872 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
873 }
874 }
875
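  // For atomic accesses, derive the SI scope and the set of ordered address
  // spaces from the sync scope ID; non-atomic accesses keep the NONE defaults.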
876 SIAtomicScope Scope = SIAtomicScope::NONE;
877 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
878 bool IsCrossAddressSpaceOrdering = false;
879 if (Ordering != AtomicOrdering::NotAtomic) {
880 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
881 if (!ScopeOrNone) {
882 reportUnsupported(MI, "Unsupported atomic synchronization scope");
883 return std::nullopt;
884 }
885 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
886 *ScopeOrNone;
887 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
888 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
889 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
890 reportUnsupported(MI, "Unsupported atomic address space");
891 return std::nullopt;
892 }
893 }
894 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
895 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
896 IsNonTemporal, IsLastUse, IsCooperative);
897}
898
899std::optional<SIMemOpInfo>
900SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
901 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
902
903 if (!(MI->mayLoad() && !MI->mayStore()))
904 return std::nullopt;
905
906 // Be conservative if there are no memory operands.
907 if (MI->getNumMemOperands() == 0)
908 return SIMemOpInfo(ST);
909
910 return constructFromMIWithMMO(MI);
911}
912
913std::optional<SIMemOpInfo>
914SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
915 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
916
917 if (!(!MI->mayLoad() && MI->mayStore()))
918 return std::nullopt;
919
920 // Be conservative if there are no memory operands.
921 if (MI->getNumMemOperands() == 0)
922 return SIMemOpInfo(ST);
923
924 return constructFromMIWithMMO(MI);
925}
926
927std::optional<SIMemOpInfo>
928SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
929 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
930
931 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
932 return std::nullopt;
933
934 AtomicOrdering Ordering =
935 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
936
937 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
938 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
939 if (!ScopeOrNone) {
940 reportUnsupported(MI, "Unsupported atomic synchronization scope");
941 return std::nullopt;
942 }
943
944 SIAtomicScope Scope = SIAtomicScope::NONE;
945 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
946 bool IsCrossAddressSpaceOrdering = false;
947 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
948 *ScopeOrNone;
949
950 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
951 // We currently expect refineOrderingAS to be the only place that
952 // can refine the AS ordered by the fence.
953 // If that changes, we need to review the semantics of that function
954 // in case it needs to preserve certain address spaces.
955 reportUnsupported(MI, "Unsupported atomic address space");
956 return std::nullopt;
957 }
958
959 auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
960 if (SynchronizeAS)
961 OrderingAddrSpace = *SynchronizeAS;
962
963 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
964 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
965 AtomicOrdering::NotAtomic);
966}
967
968std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
969 const MachineBasicBlock::iterator &MI) const {
970 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
971
972 if (!(MI->mayLoad() && MI->mayStore()))
973 return std::nullopt;
974
975 // Be conservative if there are no memory operands.
976 if (MI->getNumMemOperands() == 0)
977 return SIMemOpInfo(ST);
978
979 return constructFromMIWithMMO(MI);
980}
981
982SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
983 TII = ST.getInstrInfo();
984 IV = getIsaVersion(ST.getCPU());
985 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
986}
987
988bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
989 AMDGPU::CPol::CPol Bit) const {
990 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
991 if (!CPol)
992 return false;
993
994 CPol->setImm(CPol->getImm() | Bit);
995 return true;
996}
997
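/// Selects the most specific cache-control implementation for the subtarget,
/// checking feature sets (GFX940, GFX90A) before the generation-based defaults.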
998/* static */
999std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
1000 GCNSubtarget::Generation Generation = ST.getGeneration();
1001 if (ST.hasGFX940Insts())
1002 return std::make_unique<SIGfx940CacheControl>(ST);
1003 if (ST.hasGFX90AInsts())
1004 return std::make_unique<SIGfx90ACacheControl>(ST);
1005 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1006 return std::make_unique<SIGfx6CacheControl>(ST);
1007 if (Generation < AMDGPUSubtarget::GFX10)
1008 return std::make_unique<SIGfx7CacheControl>(ST);
1009 if (Generation < AMDGPUSubtarget::GFX11)
1010 return std::make_unique<SIGfx10CacheControl>(ST);
1011 if (Generation < AMDGPUSubtarget::GFX12)
1012 return std::make_unique<SIGfx11CacheControl>(ST);
1013 return std::make_unique<SIGfx12CacheControl>(ST);
1014}
1015
1016bool SIGfx6CacheControl::enableLoadCacheBypass(
1017 const MachineBasicBlock::iterator &MI,
1018 SIAtomicScope Scope,
1019 SIAtomicAddrSpace AddrSpace) const {
1020 assert(MI->mayLoad() && !MI->mayStore());
1021 bool Changed = false;
1022
1023 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1024 switch (Scope) {
1025 case SIAtomicScope::SYSTEM:
1026 case SIAtomicScope::AGENT:
1027 // Set L1 cache policy to MISS_EVICT.
1028 // Note: there is no L2 cache bypass policy at the ISA level.
1029 Changed |= enableGLCBit(MI);
1030 break;
1031 case SIAtomicScope::WORKGROUP:
1032 case SIAtomicScope::WAVEFRONT:
1033 case SIAtomicScope::SINGLETHREAD:
1034 // No cache to bypass.
1035 break;
1036 default:
1037 llvm_unreachable("Unsupported synchronization scope");
1038 }
1039 }
1040
1041 /// The scratch address space does not need the global memory caches
1042 /// to be bypassed as all memory operations by the same thread are
1043 /// sequentially consistent, and no other thread can access scratch
1044 /// memory.
1045
1046 /// Other address spaces do not have a cache.
1047
1048 return Changed;
1049}
1050
1051bool SIGfx6CacheControl::enableStoreCacheBypass(
1052 const MachineBasicBlock::iterator &MI,
1053 SIAtomicScope Scope,
1054 SIAtomicAddrSpace AddrSpace) const {
1055 assert(!MI->mayLoad() && MI->mayStore());
1056 bool Changed = false;
1057
1058 /// The L1 cache is write-through so does not need to be bypassed. There is no
1059 /// bypass control for the L2 cache at the ISA level.
1060
1061 return Changed;
1062}
1063
1064bool SIGfx6CacheControl::enableRMWCacheBypass(
1065 const MachineBasicBlock::iterator &MI,
1066 SIAtomicScope Scope,
1067 SIAtomicAddrSpace AddrSpace) const {
1068 assert(MI->mayLoad() && MI->mayStore());
1069 bool Changed = false;
1070
1071 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
1072 /// bypassed, and the GLC bit is instead used to indicate if they are
1073 /// return or no-return.
1074 /// Note: there is no L2 cache coherent bypass control at the ISA level.
1075
1076 return Changed;
1077}
1078
1079bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1080 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1081 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1082 // Only handle load and store, not atomic read-modify-write instructions. The
1083 // latter use glc to indicate if the atomic returns a result and so must not
1084 // be used for cache control.
1085 assert(MI->mayLoad() ^ MI->mayStore());
1086
1087 // Only update load and store, not LLVM IR atomic read-modify-write
1088 // instructions. The latter are always marked as volatile, so they cannot be
1089 // handled sensibly here without pessimizing all atomics. They also do not
1090 // support the nontemporal attribute.
1091 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1092
1093 bool Changed = false;
1094
1095 if (IsVolatile) {
1096 // Set L1 cache policy to be MISS_EVICT for load instructions
1097 // and MISS_LRU for store instructions.
1098 // Note: there is no L2 cache bypass policy at the ISA level.
1099 if (Op == SIMemOp::LOAD)
1100 Changed |= enableGLCBit(MI);
1101
1102 // Ensure operation has completed at system scope to cause all volatile
1103 // operations to be visible outside the program in a global order. Do not
1104 // request cross address space as only the global address space can be
1105 // observable outside the program, so no need to cause a waitcnt for LDS
1106 // address space operations.
1107 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1108 Position::AFTER, AtomicOrdering::Unordered);
1109
1110 return Changed;
1111 }
1112
1113 if (IsNonTemporal) {
1114 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1115 // for both loads and stores, and the L2 cache policy to STREAM.
1116 Changed |= enableGLCBit(MI);
1117 Changed |= enableSLCBit(MI);
1118 return Changed;
1119 }
1120
1121 return Changed;
1122}
1123
1124bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1125 SIAtomicScope Scope,
1126 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1127 bool IsCrossAddrSpaceOrdering, Position Pos,
1128 AtomicOrdering Order) const {
1129 bool Changed = false;
1130
1131 MachineBasicBlock &MBB = *MI->getParent();
1132 DebugLoc DL = MI->getDebugLoc();
1133
1134 if (Pos == Position::AFTER)
1135 ++MI;
1136
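  // Track whether a vmcnt(0) and/or lgkmcnt(0) wait must be emitted.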
1137 bool VMCnt = false;
1138 bool LGKMCnt = false;
1139
1140 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1141 SIAtomicAddrSpace::NONE) {
1142 switch (Scope) {
1143 case SIAtomicScope::SYSTEM:
1144 case SIAtomicScope::AGENT:
1145 VMCnt |= true;
1146 break;
1147 case SIAtomicScope::WORKGROUP:
1148 case SIAtomicScope::WAVEFRONT:
1149 case SIAtomicScope::SINGLETHREAD:
1150 // The L1 cache keeps all memory operations in order for
1151 // wavefronts in the same work-group.
1152 break;
1153 default:
1154 llvm_unreachable("Unsupported synchronization scope");
1155 }
1156 }
1157
1158 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1159 switch (Scope) {
1160 case SIAtomicScope::SYSTEM:
1161 case SIAtomicScope::AGENT:
1162 case SIAtomicScope::WORKGROUP:
1163 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1164 // not needed as LDS operations for all waves are executed in a total
1165 // global ordering as observed by all waves. Required if also
1166 // synchronizing with global/GDS memory as LDS operations could be
1167 // reordered with respect to later global/GDS memory operations of the
1168 // same wave.
1169 LGKMCnt |= IsCrossAddrSpaceOrdering;
1170 break;
1171 case SIAtomicScope::WAVEFRONT:
1172 case SIAtomicScope::SINGLETHREAD:
1173 // The LDS keeps all memory operations in order for
1174 // the same wavefront.
1175 break;
1176 default:
1177 llvm_unreachable("Unsupported synchronization scope");
1178 }
1179 }
1180
1181 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1182 switch (Scope) {
1183 case SIAtomicScope::SYSTEM:
1184 case SIAtomicScope::AGENT:
1185 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1186 // is not needed as GDS operations for all waves are executed in a total
1187 // global ordering as observed by all waves. Required if also
1188 // synchronizing with global/LDS memory as GDS operations could be
1189 // reordered with respect to later global/LDS memory operations of the
1190 // same wave.
1191 LGKMCnt |= IsCrossAddrSpaceOrdering;
1192 break;
1193 case SIAtomicScope::WORKGROUP:
1194 case SIAtomicScope::WAVEFRONT:
1195 case SIAtomicScope::SINGLETHREAD:
1196 // The GDS keeps all memory operations in order for
1197 // the same work-group.
1198 break;
1199 default:
1200 llvm_unreachable("Unsupported synchronization scope");
1201 }
1202 }
1203
1204 if (VMCnt || LGKMCnt) {
1205 unsigned WaitCntImmediate =
1206 AMDGPU::encodeWaitcnt(IV,
1207 VMCnt ? 0 : getVmcntBitMask(IV),
1208 getExpcntBitMask(IV),
1209 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1210 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1211 .addImm(WaitCntImmediate);
1212 Changed = true;
1213 }
1214
1215 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1216 // at workgroup-scoped release operations that specify the LDS address space.
1217 // SIInsertWaitcnts will later replace this with a vmcnt().
1218 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1219 Scope == SIAtomicScope::WORKGROUP &&
1220 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1221 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1222 Changed = true;
1223 }
1224
1225 if (Pos == Position::AFTER)
1226 --MI;
1227
1228 return Changed;
1229}
1230
1231bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1232 SIAtomicScope Scope,
1233 SIAtomicAddrSpace AddrSpace,
1234 Position Pos) const {
1235 if (!InsertCacheInv)
1236 return false;
1237
1238 bool Changed = false;
1239
1240 MachineBasicBlock &MBB = *MI->getParent();
1241 DebugLoc DL = MI->getDebugLoc();
1242
1243 if (Pos == Position::AFTER)
1244 ++MI;
1245
1246 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1247 switch (Scope) {
1248 case SIAtomicScope::SYSTEM:
1249 case SIAtomicScope::AGENT:
1250 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1251 Changed = true;
1252 break;
1253 case SIAtomicScope::WORKGROUP:
1254 case SIAtomicScope::WAVEFRONT:
1255 case SIAtomicScope::SINGLETHREAD:
1256 // No cache to invalidate.
1257 break;
1258 default:
1259 llvm_unreachable("Unsupported synchronization scope");
1260 }
1261 }
1262
1263 /// The scratch address space does not need the global memory cache
1264 /// to be flushed as all memory operations by the same thread are
1265 /// sequentially consistent, and no other thread can access scratch
1266 /// memory.
1267
1268 /// Other address spaces do not have a cache.
1269
1270 if (Pos == Position::AFTER)
1271 --MI;
1272
1273 return Changed;
1274}
1275
1276bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1277 SIAtomicScope Scope,
1278 SIAtomicAddrSpace AddrSpace,
1279 bool IsCrossAddrSpaceOrdering,
1280 Position Pos) const {
1281 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1282 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1283}
1284
1285bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1286 SIAtomicScope Scope,
1287 SIAtomicAddrSpace AddrSpace,
1288 Position Pos) const {
1289 if (!InsertCacheInv)
1290 return false;
1291
1292 bool Changed = false;
1293
1294 MachineBasicBlock &MBB = *MI->getParent();
1295 DebugLoc DL = MI->getDebugLoc();
1296
1297 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1298
1299 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1300 ? AMDGPU::BUFFER_WBINVL1
1301 : AMDGPU::BUFFER_WBINVL1_VOL;
1302
1303 if (Pos == Position::AFTER)
1304 ++MI;
1305
1306 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1307 switch (Scope) {
1308 case SIAtomicScope::SYSTEM:
1309 case SIAtomicScope::AGENT:
1310 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1311 Changed = true;
1312 break;
1313 case SIAtomicScope::WORKGROUP:
1314 case SIAtomicScope::WAVEFRONT:
1315 case SIAtomicScope::SINGLETHREAD:
1316 // No cache to invalidate.
1317 break;
1318 default:
1319 llvm_unreachable("Unsupported synchronization scope");
1320 }
1321 }
1322
1323 /// The scratch address space does not need the global memory cache
1324 /// to be flushed as all memory operations by the same thread are
1325 /// sequentially consistent, and no other thread can access scratch
1326 /// memory.
1327
1328 /// Other address spaces do not have a cache.
1329
1330 if (Pos == Position::AFTER)
1331 --MI;
1332
1333 return Changed;
1334}
1335
1336bool SIGfx90ACacheControl::enableLoadCacheBypass(
1337 const MachineBasicBlock::iterator &MI,
1338 SIAtomicScope Scope,
1339 SIAtomicAddrSpace AddrSpace) const {
1340 assert(MI->mayLoad() && !MI->mayStore());
1341 bool Changed = false;
1342
1343 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1344 switch (Scope) {
1345 case SIAtomicScope::SYSTEM:
1346 case SIAtomicScope::AGENT:
1347 // Set the L1 cache policy to MISS_LRU.
1348 // Note: there is no L2 cache bypass policy at the ISA level.
1349 Changed |= enableGLCBit(MI);
1350 break;
1351 case SIAtomicScope::WORKGROUP:
1352 // In threadgroup split mode the waves of a work-group can be executing on
1353 // different CUs. Therefore need to bypass the L1 which is per CU.
1354 // Otherwise in non-threadgroup split mode all waves of a work-group are
1355 // on the same CU, and so the L1 does not need to be bypassed.
1356 if (ST.isTgSplitEnabled())
1357 Changed |= enableGLCBit(MI);
1358 break;
1359 case SIAtomicScope::WAVEFRONT:
1360 case SIAtomicScope::SINGLETHREAD:
1361 // No cache to bypass.
1362 break;
1363 default:
1364 llvm_unreachable("Unsupported synchronization scope");
1365 }
1366 }
1367
1368 /// The scratch address space does not need the global memory caches
1369 /// to be bypassed as all memory operations by the same thread are
1370 /// sequentially consistent, and no other thread can access scratch
1371 /// memory.
1372
1373 /// Other address spaces do not have a cache.
1374
1375 return Changed;
1376}
1377
1378bool SIGfx90ACacheControl::enableStoreCacheBypass(
1379 const MachineBasicBlock::iterator &MI,
1380 SIAtomicScope Scope,
1381 SIAtomicAddrSpace AddrSpace) const {
1382 assert(!MI->mayLoad() && MI->mayStore());
1383 bool Changed = false;
1384
1385 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1386 switch (Scope) {
1387 case SIAtomicScope::SYSTEM:
1388 case SIAtomicScope::AGENT:
1389 /// Do not set glc for store atomic operations as they implicitly write
1390 /// through the L1 cache.
1391 break;
1392 case SIAtomicScope::WORKGROUP:
1393 case SIAtomicScope::WAVEFRONT:
1394 case SIAtomicScope::SINGLETHREAD:
1395 // No cache to bypass. Store atomics implicitly write through the L1
1396 // cache.
1397 break;
1398 default:
1399 llvm_unreachable("Unsupported synchronization scope");
1400 }
1401 }
1402
1403 /// The scratch address space does not need the global memory caches
1404 /// to be bypassed as all memory operations by the same thread are
1405 /// sequentially consistent, and no other thread can access scratch
1406 /// memory.
1407
1408 /// Other address spaces do not have a cache.
1409
1410 return Changed;
1411}
1412
1413bool SIGfx90ACacheControl::enableRMWCacheBypass(
1414 const MachineBasicBlock::iterator &MI,
1415 SIAtomicScope Scope,
1416 SIAtomicAddrSpace AddrSpace) const {
1417 assert(MI->mayLoad() && MI->mayStore());
1418 bool Changed = false;
1419
1420 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1421 switch (Scope) {
1422 case SIAtomicScope::SYSTEM:
1423 case SIAtomicScope::AGENT:
1424 /// Do not set glc for RMW atomic operations as they implicitly bypass
1425 /// the L1 cache, and the glc bit is instead used to indicate if they are
1426 /// return or no-return.
1427 break;
1428 case SIAtomicScope::WORKGROUP:
1429 case SIAtomicScope::WAVEFRONT:
1430 case SIAtomicScope::SINGLETHREAD:
1431 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1432 break;
1433 default:
1434 llvm_unreachable("Unsupported synchronization scope");
1435 }
1436 }
1437
1438 return Changed;
1439}
1440
1441bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1442 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1443 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1444 // Only handle load and store, not atomic read-modify-write instructions. The
1445 // latter use glc to indicate if the atomic returns a result and so must not
1446 // be used for cache control.
1447 assert(MI->mayLoad() ^ MI->mayStore());
1448
1449 // Only update load and store, not LLVM IR atomic read-modify-write
1450 // instructions. The latter are always marked as volatile, so they cannot be
1451 // handled sensibly here without pessimizing all atomics. They also do not
1452 // support the nontemporal attribute.
1453 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1454
1455 bool Changed = false;
1456
1457 if (IsVolatile) {
1458 // Set L1 cache policy to be MISS_EVICT for load instructions
1459 // and MISS_LRU for store instructions.
1460 // Note: there is no L2 cache bypass policy at the ISA level.
1461 if (Op == SIMemOp::LOAD)
1462 Changed |= enableGLCBit(MI);
1463
1464 // Ensure operation has completed at system scope to cause all volatile
1465 // operations to be visible outside the program in a global order. Do not
1466 // request cross address space as only the global address space can be
1467 // observable outside the program, so no need to cause a waitcnt for LDS
1468 // address space operations.
1469 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1470 Position::AFTER, AtomicOrdering::Unordered);
1471
1472 return Changed;
1473 }
1474
1475 if (IsNonTemporal) {
1476 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1477 // for both loads and stores, and the L2 cache policy to STREAM.
1478 Changed |= enableGLCBit(MI);
1479 Changed |= enableSLCBit(MI);
1480 return Changed;
1481 }
1482
1483 return Changed;
1484}
1485
1486bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1487 SIAtomicScope Scope,
1488 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1489 bool IsCrossAddrSpaceOrdering,
1490 Position Pos,
1491 AtomicOrdering Order) const {
1492 if (ST.isTgSplitEnabled()) {
1493 // In threadgroup split mode the waves of a work-group can be executing on
1494 // different CUs. Therefore need to wait for global or GDS memory operations
1495 // to complete to ensure they are visible to waves in the other CUs.
1496 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1497 // the same CU, so no need to wait for global memory as all waves in the
1498 // work-group access the same L1, nor wait for GDS as accesses are ordered
1499 // on a CU.
1500 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1501 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1502 (Scope == SIAtomicScope::WORKGROUP)) {
1503 // Same as GFX7 using agent scope.
1504 Scope = SIAtomicScope::AGENT;
1505 }
1506 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1507 // LDS memory operations.
1508 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1509 }
1510 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1511 IsCrossAddrSpaceOrdering, Pos, Order);
1512}
1513
1514bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1515 SIAtomicScope Scope,
1516 SIAtomicAddrSpace AddrSpace,
1517 Position Pos) const {
1518 if (!InsertCacheInv)
1519 return false;
1520
1521 bool Changed = false;
1522
1523 MachineBasicBlock &MBB = *MI->getParent();
1524 DebugLoc DL = MI->getDebugLoc();
1525
1526 if (Pos == Position::AFTER)
1527 ++MI;
1528
1529 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1530 switch (Scope) {
1531 case SIAtomicScope::SYSTEM:
1532 // Ensures that following loads will not see stale remote VMEM data or
1533 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1534 // CC will never be stale due to the local memory probes.
1535 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1536 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1537 // hardware does not reorder memory operations by the same wave with
1538 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1539 // remove any cache lines of earlier writes by the same wave and ensures
1540 // later reads by the same wave will refetch the cache lines.
1541 Changed = true;
1542 break;
1543 case SIAtomicScope::AGENT:
1544 // Same as GFX7.
1545 break;
1546 case SIAtomicScope::WORKGROUP:
1547 // In threadgroup split mode the waves of a work-group can be executing on
1548 // different CUs. Therefore need to invalidate the L1 which is per CU.
1549 // Otherwise in non-threadgroup split mode all waves of a work-group are
1550 // on the same CU, and so the L1 does not need to be invalidated.
1551 if (ST.isTgSplitEnabled()) {
1552 // Same as GFX7 using agent scope.
1553 Scope = SIAtomicScope::AGENT;
1554 }
1555 break;
1556 case SIAtomicScope::WAVEFRONT:
1557 case SIAtomicScope::SINGLETHREAD:
1558 // Same as GFX7.
1559 break;
1560 default:
1561 llvm_unreachable("Unsupported synchronization scope");
1562 }
1563 }
1564
1565 /// The scratch address space does not need the global memory cache
1566 /// to be flushed as all memory operations by the same thread are
1567 /// sequentially consistent, and no other thread can access scratch
1568 /// memory.
1569
1570 /// Other address spaces do not have a cache.
1571
1572 if (Pos == Position::AFTER)
1573 --MI;
1574
1575 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1576
1577 return Changed;
1578}
1579
1580bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1581 SIAtomicScope Scope,
1582 SIAtomicAddrSpace AddrSpace,
1583 bool IsCrossAddrSpaceOrdering,
1584 Position Pos) const {
1585 bool Changed = false;
1586
1587 MachineBasicBlock &MBB = *MI->getParent();
1588 const DebugLoc &DL = MI->getDebugLoc();
1589
1590 if (Pos == Position::AFTER)
1591 ++MI;
1592
1593 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1594 switch (Scope) {
1595 case SIAtomicScope::SYSTEM:
1596 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1597 // hardware does not reorder memory operations by the same wave with
1598 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1599 // to initiate writeback of any dirty cache lines of earlier writes by the
1600 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1601 // writeback has completed.
1602 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1603 // Set SC bits to indicate system scope.
1604 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1605 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1606 // vmcnt(0)" needed by the "BUFFER_WBL2".
1607 Changed = true;
1608 break;
1609 case SIAtomicScope::AGENT:
1610 case SIAtomicScope::WORKGROUP:
1611 case SIAtomicScope::WAVEFRONT:
1612 case SIAtomicScope::SINGLETHREAD:
1613 // Same as GFX7.
1614 break;
1615 default:
1616 llvm_unreachable("Unsupported synchronization scope");
1617 }
1618 }
1619
1620 if (Pos == Position::AFTER)
1621 --MI;
1622
1623 Changed |=
1624 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1625 IsCrossAddrSpaceOrdering, Pos);
1626
1627 return Changed;
1628}
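// Illustrative result (not generated verbatim here): a system-scope release on
// gfx90a typically becomes a "buffer_wbl2" followed by the "s_waitcnt vmcnt(0)"
// produced by the delegated GFX7 release path above.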
1629
1630bool SIGfx940CacheControl::enableLoadCacheBypass(
1631 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1632 SIAtomicAddrSpace AddrSpace) const {
1633 assert(MI->mayLoad() && !MI->mayStore());
1634 bool Changed = false;
1635
1636 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1637 switch (Scope) {
1638 case SIAtomicScope::SYSTEM:
1639 // Set SC bits to indicate system scope.
1640 Changed |= enableSC0Bit(MI);
1641 Changed |= enableSC1Bit(MI);
1642 break;
1643 case SIAtomicScope::AGENT:
1644 // Set SC bits to indicate agent scope.
1645 Changed |= enableSC1Bit(MI);
1646 break;
1647 case SIAtomicScope::WORKGROUP:
1648 // In threadgroup split mode the waves of a work-group can be executing on
1649 // different CUs. Therefore need to bypass the L1 which is per CU.
1650 // Otherwise in non-threadgroup split mode all waves of a work-group are
1651 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1652 // bits to indicate work-group scope will do this automatically.
1653 Changed |= enableSC0Bit(MI);
1654 break;
1655 case SIAtomicScope::WAVEFRONT:
1656 case SIAtomicScope::SINGLETHREAD:
1657 // Leave SC bits unset to indicate wavefront scope.
1658 break;
1659 default:
1660 llvm_unreachable("Unsupported synchronization scope");
1661 }
1662 }
1663
1664 /// The scratch address space does not need the global memory caches
1665 /// to be bypassed as all memory operations by the same thread are
1666 /// sequentially consistent, and no other thread can access scratch
1667 /// memory.
1668
1669 /// Other address spaces do not have a cache.
1670
1671 return Changed;
1672}
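// Summary of the SC-bit mapping implemented above for gfx940 loads:
//   SYSTEM                 -> sc0 sc1
//   AGENT                  -> sc1
//   WORKGROUP              -> sc0
//   WAVEFRONT/SINGLETHREAD -> neither bit set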
1673
1674 bool SIGfx940CacheControl::enableStoreCacheBypass(
1675 const MachineBasicBlock::iterator &MI,
1676 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1677 assert(!MI->mayLoad() && MI->mayStore());
1678 bool Changed = false;
1679
1680 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1681 switch (Scope) {
1682 case SIAtomicScope::SYSTEM:
1683 // Set SC bits to indicate system scope.
1684 Changed |= enableSC0Bit(MI);
1685 Changed |= enableSC1Bit(MI);
1686 break;
1687 case SIAtomicScope::AGENT:
1688 // Set SC bits to indicate agent scope.
1689 Changed |= enableSC1Bit(MI);
1690 break;
1691 case SIAtomicScope::WORKGROUP:
1692 // Set SC bits to indicate workgroup scope.
1693 Changed |= enableSC0Bit(MI);
1694 break;
1695 case SIAtomicScope::WAVEFRONT:
1696 case SIAtomicScope::SINGLETHREAD:
1697 // Leave SC bits unset to indicate wavefront scope.
1698 break;
1699 default:
1700 llvm_unreachable("Unsupported synchronization scope");
1701 }
1702 }
1703
1704 /// The scratch address space does not need the global memory caches
1705 /// to be bypassed as all memory operations by the same thread are
1706 /// sequentially consistent, and no other thread can access scratch
1707 /// memory.
1708
1709 /// Other address spaces do not have a cache.
1710
1711 return Changed;
1712}
1713
1714bool SIGfx940CacheControl::enableRMWCacheBypass(
1715 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1716 SIAtomicAddrSpace AddrSpace) const {
1717 assert(MI->mayLoad() && MI->mayStore());
1718 bool Changed = false;
1719
1720 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1721 switch (Scope) {
1722 case SIAtomicScope::SYSTEM:
1723 // Set SC1 bit to indicate system scope.
1724 Changed |= enableSC1Bit(MI);
1725 break;
1726 case SIAtomicScope::AGENT:
1727 case SIAtomicScope::WORKGROUP:
1728 case SIAtomicScope::WAVEFRONT:
1729 case SIAtomicScope::SINGLETHREAD:
1730 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1731 // to indicate system or agent scope. The SC0 bit is used to indicate if
1732 // they are return or no-return. Leave SC1 bit unset to indicate agent
1733 // scope.
1734 break;
1735 default:
1736 llvm_unreachable("Unsupported synchronization scope");
1737 }
1738 }
1739
1740 return Changed;
1741}
1742
1743bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1744 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1745 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1746 // Only handle load and store, not atomic read-modify-write instructions. The
1747 // latter use glc to indicate if the atomic returns a result and so must not
1748 // be used for cache control.
1749 assert(MI->mayLoad() ^ MI->mayStore());
1750
1751 // Only update load and store, not LLVM IR atomic read-modify-write
1752 // instructions. The latter are always marked as volatile, so they cannot be
1753 // handled sensibly here without pessimizing all atomics. They also do not support
1754 // the nontemporal attribute.
1755 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1756
1757 bool Changed = false;
1758
1759 if (IsVolatile) {
1760 // Set SC bits to indicate system scope.
1761 Changed |= enableSC0Bit(MI);
1762 Changed |= enableSC1Bit(MI);
1763
1764 // Ensure operation has completed at system scope to cause all volatile
1765 // operations to be visible outside the program in a global order. Do not
1766 // request cross address space as only the global address space can be
1767 // observable outside the program, so no need to cause a waitcnt for LDS
1768 // address space operations.
1769 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1770 Position::AFTER, AtomicOrdering::Unordered);
1771
1772 return Changed;
1773 }
1774
1775 if (IsNonTemporal) {
1776 Changed |= enableNTBit(MI);
1777 return Changed;
1778 }
1779
1780 return Changed;
1781}
1782
1783bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1784 SIAtomicScope Scope,
1785 SIAtomicAddrSpace AddrSpace,
1786 Position Pos) const {
1787 if (!InsertCacheInv)
1788 return false;
1789
1790 bool Changed = false;
1791
1792 MachineBasicBlock &MBB = *MI->getParent();
1793 DebugLoc DL = MI->getDebugLoc();
1794
1795 if (Pos == Position::AFTER)
1796 ++MI;
1797
1798 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1799 switch (Scope) {
1800 case SIAtomicScope::SYSTEM:
1801 // Ensures that following loads will not see stale remote VMEM data or
1802 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1803 // CC will never be stale due to the local memory probes.
1804 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1805 // Set SC bits to indicate system scope.
1806 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1807 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1808 // hardware does not reorder memory operations by the same wave with
1809 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1810 // remove any cache lines of earlier writes by the same wave and ensures
1811 // later reads by the same wave will refetch the cache lines.
1812 Changed = true;
1813 break;
1814 case SIAtomicScope::AGENT:
1815 // Ensures that following loads will not see stale remote data or local
1816 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1817 // due to the memory probes.
1818 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1819 // Set SC bits to indicate agent scope.
1820 .addImm(AMDGPU::CPol::SC1);
1821 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1822 // does not reorder memory operations with respect to the preceding buffer
1823 // invalidate. The invalidate is guaranteed to remove any cache lines of
1824 // earlier writes and ensures later reads will refetch the cache lines.
1825 Changed = true;
1826 break;
1827 case SIAtomicScope::WORKGROUP:
1828 // In threadgroup split mode the waves of a work-group can be executing on
1829 // different CUs. Therefore need to invalidate the L1 which is per CU.
1830 // Otherwise in non-threadgroup split mode all waves of a work-group are
1831 // on the same CU, and so the L1 does not need to be invalidated.
1832 if (ST.isTgSplitEnabled()) {
1833 // Ensures L1 is invalidated if in threadgroup split mode. In
1834 // non-threadgroup split mode it is a NOP, but there is no point generating
1835 // it in that case if we know we are not in that mode.
1836 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1837 // Set SC bits to indicate work-group scope.
1838 .addImm(AMDGPU::CPol::SC0);
1839 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1840 // does not reorder memory operations with respect to the preceding buffer
1841 // invalidate. The invalidate is guaranteed to remove any cache lines of
1842 // earlier writes and ensures later reads will refetch the cache lines.
1843 Changed = true;
1844 }
1845 break;
1846 case SIAtomicScope::WAVEFRONT:
1847 case SIAtomicScope::SINGLETHREAD:
1848 // Could generate "BUFFER_INV" but it would do nothing as there are no
1849 // caches to invalidate.
1850 break;
1851 default:
1852 llvm_unreachable("Unsupported synchronization scope");
1853 }
1854 }
1855
1856 /// The scratch address space does not need the global memory cache
1857 /// to be flushed as all memory operations by the same thread are
1858 /// sequentially consistent, and no other thread can access scratch
1859 /// memory.
1860
1861 /// Other address spaces do not have a cache.
1862
1863 if (Pos == Position::AFTER)
1864 --MI;
1865
1866 return Changed;
1867}
1868
1869bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1870 SIAtomicScope Scope,
1871 SIAtomicAddrSpace AddrSpace,
1872 bool IsCrossAddrSpaceOrdering,
1873 Position Pos) const {
1874 bool Changed = false;
1875
1876 MachineBasicBlock &MBB = *MI->getParent();
1877 DebugLoc DL = MI->getDebugLoc();
1878
1879 if (Pos == Position::AFTER)
1880 ++MI;
1881
1882 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1883 switch (Scope) {
1884 case SIAtomicScope::SYSTEM:
1885 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1886 // hardware does not reorder memory operations by the same wave with
1887 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1888 // to initiate writeback of any dirty cache lines of earlier writes by the
1889 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1890 // writeback has completed.
1891 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1892 // Set SC bits to indicate system scope.
1893 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1894 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1895 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1896 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1897 Changed = true;
1898 break;
1899 case SIAtomicScope::AGENT:
1900 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1901 // Set SC bits to indicate agent scope.
1902 .addImm(AMDGPU::CPol::SC1);
1903
1904 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1905 // SIAtomicScope::AGENT, the following insertWait will generate the
1906 // required "S_WAITCNT vmcnt(0)".
1907 Changed = true;
1908 break;
1909 case SIAtomicScope::WORKGROUP:
1910 case SIAtomicScope::WAVEFRONT:
1911 case SIAtomicScope::SINGLETHREAD:
1912 // Do not generate "BUFFER_WBL2" as there are no caches it would
1913 // writeback, and would require an otherwise unnecessary
1914 // "S_WAITCNT vmcnt(0)".
1915 break;
1916 default:
1917 llvm_unreachable("Unsupported synchronization scope");
1918 }
1919 }
1920
1921 if (Pos == Position::AFTER)
1922 --MI;
1923
1924 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1925 // S_WAITCNT needed.
1926 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1927 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1928
1929 return Changed;
1930}
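// Illustrative result: an agent-scope release on gfx940 becomes a
// "buffer_wbl2 sc1" followed by the "s_waitcnt vmcnt(0)" emitted by the
// insertWait call above; the exact operand spelling depends on the assembler.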
1931
1932 bool SIGfx10CacheControl::enableLoadCacheBypass(
1933 const MachineBasicBlock::iterator &MI,
1934 SIAtomicScope Scope,
1935 SIAtomicAddrSpace AddrSpace) const {
1936 assert(MI->mayLoad() && !MI->mayStore());
1937 bool Changed = false;
1938
1939 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1940 switch (Scope) {
1941 case SIAtomicScope::SYSTEM:
1942 case SIAtomicScope::AGENT:
1943 // Set the L0 and L1 cache policies to MISS_EVICT.
1944 // Note: there is no L2 cache coherent bypass control at the ISA level.
1945 Changed |= enableGLCBit(MI);
1946 Changed |= enableDLCBit(MI);
1947 break;
1948 case SIAtomicScope::WORKGROUP:
1949 // In WGP mode the waves of a work-group can be executing on either CU of
1950 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1951 // CU mode all waves of a work-group are on the same CU, and so the L0
1952 // does not need to be bypassed.
1953 if (!ST.isCuModeEnabled())
1954 Changed |= enableGLCBit(MI);
1955 break;
1956 case SIAtomicScope::WAVEFRONT:
1957 case SIAtomicScope::SINGLETHREAD:
1958 // No cache to bypass.
1959 break;
1960 default:
1961 llvm_unreachable("Unsupported synchronization scope");
1962 }
1963 }
1964
1965 /// The scratch address space does not need the global memory caches
1966 /// to be bypassed as all memory operations by the same thread are
1967 /// sequentially consistent, and no other thread can access scratch
1968 /// memory.
1969
1970 /// Other address spaces do not have a cache.
1971
1972 return Changed;
1973}
1974
1975bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1976 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1977 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1978
1979 // Only handle load and store, not atomic read-modify-write instructions. The
1980 // latter use glc to indicate if the atomic returns a result and so must not
1981 // be used for cache control.
1982 assert(MI->mayLoad() ^ MI->mayStore());
1983
1984 // Only update load and store, not LLVM IR atomic read-modify-write
1985 // instructions. The latter are always marked as volatile, so they cannot be
1986 // handled sensibly here without pessimizing all atomics. They also do not support
1987 // the nontemporal attribute.
1988 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1989
1990 bool Changed = false;
1991
1992 if (IsVolatile) {
1993 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1994 // and MISS_LRU for store instructions.
1995 // Note: there is no L2 cache coherent bypass control at the ISA level.
1996 if (Op == SIMemOp::LOAD) {
1997 Changed |= enableGLCBit(MI);
1998 Changed |= enableDLCBit(MI);
1999 }
2000
2001 // Ensure operation has completed at system scope to cause all volatile
2002 // operations to be visible outside the program in a global order. Do not
2003 // request cross address space as only the global address space can be
2004 // observable outside the program, so no need to cause a waitcnt for LDS
2005 // address space operations.
2006 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2007 Position::AFTER, AtomicOrdering::Unordered);
2008 return Changed;
2009 }
2010
2011 if (IsNonTemporal) {
2012 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2013 // and L2 cache policy to STREAM.
2014 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2015 // to MISS_EVICT and the L2 cache policy to STREAM.
2016 if (Op == SIMemOp::STORE)
2017 Changed |= enableGLCBit(MI);
2018 Changed |= enableSLCBit(MI);
2019
2020 return Changed;
2021 }
2022
2023 return Changed;
2024}
2025
2026bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2027 SIAtomicScope Scope,
2028 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2029 bool IsCrossAddrSpaceOrdering,
2030 Position Pos, AtomicOrdering Order) const {
2031 bool Changed = false;
2032
2033 MachineBasicBlock &MBB = *MI->getParent();
2034 DebugLoc DL = MI->getDebugLoc();
2035
2036 if (Pos == Position::AFTER)
2037 ++MI;
2038
2039 bool VMCnt = false;
2040 bool VSCnt = false;
2041 bool LGKMCnt = false;
2042
2043 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2044 SIAtomicAddrSpace::NONE) {
2045 switch (Scope) {
2046 case SIAtomicScope::SYSTEM:
2047 case SIAtomicScope::AGENT:
2048 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2049 VMCnt |= true;
2050 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2051 VSCnt |= true;
2052 break;
2053 case SIAtomicScope::WORKGROUP:
2054 // In WGP mode the waves of a work-group can be executing on either CU of
2055 // the WGP. Therefore need to wait for operations to complete to ensure
2056 // they are visible to waves in the other CU as the L0 is per CU.
2057 // Otherwise in CU mode all waves of a work-group are on the same CU
2058 // which shares the same L0.
2059 if (!ST.isCuModeEnabled()) {
2060 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2061 VMCnt |= true;
2062 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2063 VSCnt |= true;
2064 }
2065 break;
2066 case SIAtomicScope::WAVEFRONT:
2067 case SIAtomicScope::SINGLETHREAD:
2068 // The L0 cache keeps all memory operations in order for
2069 // work-items in the same wavefront.
2070 break;
2071 default:
2072 llvm_unreachable("Unsupported synchronization scope");
2073 }
2074 }
2075
2076 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2077 switch (Scope) {
2078 case SIAtomicScope::SYSTEM:
2079 case SIAtomicScope::AGENT:
2080 case SIAtomicScope::WORKGROUP:
2081 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2082 // not needed as LDS operations for all waves are executed in a total
2083 // global ordering as observed by all waves. Required if also
2084 // synchronizing with global/GDS memory as LDS operations could be
2085 // reordered with respect to later global/GDS memory operations of the
2086 // same wave.
2087 LGKMCnt |= IsCrossAddrSpaceOrdering;
2088 break;
2089 case SIAtomicScope::WAVEFRONT:
2090 case SIAtomicScope::SINGLETHREAD:
2091 // The LDS keeps all memory operations in order for
2092 // the same wavefront.
2093 break;
2094 default:
2095 llvm_unreachable("Unsupported synchronization scope");
2096 }
2097 }
2098
2099 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2100 switch (Scope) {
2101 case SIAtomicScope::SYSTEM:
2102 case SIAtomicScope::AGENT:
2103 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
2104 // is not needed as GDS operations for all waves are executed in a total
2105 // global ordering as observed by all waves. Required if also
2106 // synchronizing with global/LDS memory as GDS operations could be
2107 // reordered with respect to later global/LDS memory operations of the
2108 // same wave.
2109 LGKMCnt |= IsCrossAddrSpaceOrdering;
2110 break;
2111 case SIAtomicScope::WORKGROUP:
2112 case SIAtomicScope::WAVEFRONT:
2113 case SIAtomicScope::SINGLETHREAD:
2114 // The GDS keeps all memory operations in order for
2115 // the same work-group.
2116 break;
2117 default:
2118 llvm_unreachable("Unsupported synchronization scope");
2119 }
2120 }
2121
2122 if (VMCnt || LGKMCnt) {
2122 unsigned WaitCntImmediate =
2123 AMDGPU::encodeWaitcnt(
2124 IV,
2125 VMCnt ? 0 : getVmcntBitMask(IV),
2126 getExpcntBitMask(IV),
2127 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2128 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2129 .addImm(WaitCntImmediate);
2130 Changed = true;
2131 }
2132
2133 // On architectures that support direct loads to LDS, emit an unknown waitcnt
2134 // at workgroup-scoped release operations that specify the LDS address space.
2135 // SIInsertWaitcnts will later replace this with a vmcnt().
2136 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
2137 Scope == SIAtomicScope::WORKGROUP &&
2138 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2139 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
2140 Changed = true;
2141 }
2142
2143 if (VSCnt) {
2144 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2145 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2146 .addImm(0);
2147 Changed = true;
2148 }
2149
2150 if (Pos == Position::AFTER)
2151 --MI;
2152
2153 return Changed;
2154}
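// Illustrative result: an agent-scope acquire fence over global loads and
// stores emits roughly "s_waitcnt vmcnt(0)" plus "s_waitcnt_vscnt null, 0x0".
// Both are emitted as soft waitcnts, which SIInsertWaitcnts may later merge
// with its own counts or drop if it can prove them redundant.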
2155
2156bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2157 SIAtomicScope Scope,
2158 SIAtomicAddrSpace AddrSpace,
2159 Position Pos) const {
2160 if (!InsertCacheInv)
2161 return false;
2162
2163 bool Changed = false;
2164
2165 MachineBasicBlock &MBB = *MI->getParent();
2166 DebugLoc DL = MI->getDebugLoc();
2167
2168 if (Pos == Position::AFTER)
2169 ++MI;
2170
2171 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2172 switch (Scope) {
2173 case SIAtomicScope::SYSTEM:
2174 case SIAtomicScope::AGENT:
2175 // The order of invalidates matters here. We must invalidate "outer in"
2176 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2177 // invalidated.
2178 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2179 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2180 Changed = true;
2181 break;
2182 case SIAtomicScope::WORKGROUP:
2183 // In WGP mode the waves of a work-group can be executing on either CU of
2184 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2185 // in CU mode all waves of a work-group are on the same CU, and so the
2186 // L0 does not need to be invalidated.
2187 if (!ST.isCuModeEnabled()) {
2188 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2189 Changed = true;
2190 }
2191 break;
2192 case SIAtomicScope::WAVEFRONT:
2193 case SIAtomicScope::SINGLETHREAD:
2194 // No cache to invalidate.
2195 break;
2196 default:
2197 llvm_unreachable("Unsupported synchronization scope");
2198 }
2199 }
2200
2201 /// The scratch address space does not need the global memory cache
2202 /// to be flushed as all memory operations by the same thread are
2203 /// sequentially consistent, and no other thread can access scratch
2204 /// memory.
2205
2206 /// Other address spaces do not have a cache.
2207
2208 if (Pos == Position::AFTER)
2209 --MI;
2210
2211 return Changed;
2212}
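// Illustrative overall sequence (assuming a seq_cst agent-scope global load on
// GFX10): the load is marked glc dlc, insertWait adds waits before and after
// it, and this function appends "buffer_gl1_inv" then "buffer_gl0_inv" so the
// L0 cannot be refilled from a stale L1 line.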
2213
2214 bool SIGfx10CacheControl::insertBarrierStart(
2215 MachineBasicBlock::iterator &MI) const {
2216 // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
2217 // mode. This is because a CU mode release fence does not emit any wait, which
2218 // is fine when only dealing with vmem, but isn't sufficient in the presence
2219 // of barriers which do not go through vmem.
2220 // GFX12.5 does not require this additional wait.
2221 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
2222 return false;
2223
2224 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2225 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
2226 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
2227 return true;
2228}
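// The emitted instruction is a single "s_waitcnt_depctr" whose vm_vsrc field
// is set to 0, i.e. (roughly) wait until outstanding VMEM instructions have
// read their VGPR sources before the barrier proceeds.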
2229
2230bool SIGfx11CacheControl::enableLoadCacheBypass(
2231 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2232 SIAtomicAddrSpace AddrSpace) const {
2233 assert(MI->mayLoad() && !MI->mayStore());
2234 bool Changed = false;
2235
2236 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2237 switch (Scope) {
2238 case SIAtomicScope::SYSTEM:
2239 case SIAtomicScope::AGENT:
2240 // Set the L0 and L1 cache policies to MISS_EVICT.
2241 // Note: there is no L2 cache coherent bypass control at the ISA level.
2242 Changed |= enableGLCBit(MI);
2243 break;
2244 case SIAtomicScope::WORKGROUP:
2245 // In WGP mode the waves of a work-group can be executing on either CU of
2246 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2247 // CU mode all waves of a work-group are on the same CU, and so the L0
2248 // does not need to be bypassed.
2249 if (!ST.isCuModeEnabled())
2250 Changed |= enableGLCBit(MI);
2251 break;
2252 case SIAtomicScope::WAVEFRONT:
2253 case SIAtomicScope::SINGLETHREAD:
2254 // No cache to bypass.
2255 break;
2256 default:
2257 llvm_unreachable("Unsupported synchronization scope");
2258 }
2259 }
2260
2261 /// The scratch address space does not need the global memory caches
2262 /// to be bypassed as all memory operations by the same thread are
2263 /// sequentially consistent, and no other thread can access scratch
2264 /// memory.
2265
2266 /// Other address spaces do not have a cache.
2267
2268 return Changed;
2269}
2270
2271bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2272 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2273 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2274
2275 // Only handle load and store, not atomic read-modify-write instructions. The
2276 // latter use glc to indicate if the atomic returns a result and so must not
2277 // be used for cache control.
2278 assert(MI->mayLoad() ^ MI->mayStore());
2279
2280 // Only update load and store, not LLVM IR atomic read-modify-write
2281 // instructions. The latter are always marked as volatile, so they cannot be
2282 // handled sensibly here without pessimizing all atomics. They also do not support
2283 // the nontemporal attribute.
2284 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2285
2286 bool Changed = false;
2287
2288 if (IsVolatile) {
2289 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2290 // and MISS_LRU for store instructions.
2291 // Note: there is no L2 cache coherent bypass control at the ISA level.
2292 if (Op == SIMemOp::LOAD)
2293 Changed |= enableGLCBit(MI);
2294
2295 // Set MALL NOALLOC for load and store instructions.
2296 Changed |= enableDLCBit(MI);
2297
2298 // Ensure operation has completed at system scope to cause all volatile
2299 // operations to be visible outside the program in a global order. Do not
2300 // request cross address space as only the global address space can be
2301 // observable outside the program, so no need to cause a waitcnt for LDS
2302 // address space operations.
2303 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2304 Position::AFTER, AtomicOrdering::Unordered);
2305 return Changed;
2306 }
2307
2308 if (IsNonTemporal) {
2309 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2310 // and L2 cache policy to STREAM.
2311 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2312 // to MISS_EVICT and the L2 cache policy to STREAM.
2313 if (Op == SIMemOp::STORE)
2314 Changed |= enableGLCBit(MI);
2315 Changed |= enableSLCBit(MI);
2316
2317 // Set MALL NOALLOC for load and store instructions.
2318 Changed |= enableDLCBit(MI);
2319 return Changed;
2320 }
2321
2322 return Changed;
2323}
2324
2325bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2326 AMDGPU::CPol::CPol Value) const {
2327 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2328 if (!CPol)
2329 return false;
2330
2331 uint64_t NewTH = Value & AMDGPU::CPol::TH;
2332 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2333 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2334 return true;
2335 }
2336
2337 return false;
2338}
2339
2340bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2341 AMDGPU::CPol::CPol Value) const {
2342 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2343 if (!CPol)
2344 return false;
2345
2346 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2347 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2348 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2349 return true;
2350 }
2351
2352 return false;
2353}
2354
2355bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2356 const MachineBasicBlock::iterator MI) const {
2357 // TODO: implement flag for frontend to give us a hint not to insert waits.
2358
2359 MachineBasicBlock &MBB = *MI->getParent();
2360 const DebugLoc &DL = MI->getDebugLoc();
2361
2362 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2363 if (ST.hasImageInsts()) {
2364 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2365 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2366 }
2367 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2368 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2369
2370 return true;
2371}
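// Illustrative result on GFX12.0 before a scope:SCOPE_SYS store:
//   s_wait_loadcnt 0x0
//   s_wait_samplecnt 0x0   (only if the target has image instructions)
//   s_wait_bvhcnt 0x0      (likewise)
//   s_wait_kmcnt 0x0
//   s_wait_storecnt 0x0
// all as soft waits that SIInsertWaitcnts may later relax.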
2372
2373bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2374 SIAtomicScope Scope,
2375 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2376 bool IsCrossAddrSpaceOrdering,
2377 Position Pos, AtomicOrdering Order) const {
2378 bool Changed = false;
2379
2380 MachineBasicBlock &MBB = *MI->getParent();
2381 DebugLoc DL = MI->getDebugLoc();
2382
2383 bool LOADCnt = false;
2384 bool DSCnt = false;
2385 bool STORECnt = false;
2386
2387 if (Pos == Position::AFTER)
2388 ++MI;
2389
2390 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2391 SIAtomicAddrSpace::NONE) {
2392 switch (Scope) {
2393 case SIAtomicScope::SYSTEM:
2394 case SIAtomicScope::AGENT:
2395 case SIAtomicScope::CLUSTER:
2396 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2397 LOADCnt |= true;
2398 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2399 STORECnt |= true;
2400 break;
2401 case SIAtomicScope::WORKGROUP:
2402 // GFX12.0:
2403 // In WGP mode the waves of a work-group can be executing on either CU
2404 // of the WGP. Therefore need to wait for operations to complete to
2405 // ensure they are visible to waves in the other CU as the L0 is per CU.
2406 // Otherwise in CU mode all waves of a work-group are on the same CU
2407 // which shares the same L0.
2408 //
2409 // GFX12.5:
2410 // TODO DOCS
2411 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
2412 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2413 LOADCnt |= true;
2414 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2415 STORECnt |= true;
2416 }
2417 break;
2418 case SIAtomicScope::WAVEFRONT:
2419 case SIAtomicScope::SINGLETHREAD:
2420 // The L0 cache keeps all memory operations in order for
2421 // work-items in the same wavefront.
2422 break;
2423 default:
2424 llvm_unreachable("Unsupported synchronization scope");
2425 }
2426 }
2427
2428 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2429 switch (Scope) {
2430 case SIAtomicScope::SYSTEM:
2431 case SIAtomicScope::AGENT:
2432 case SIAtomicScope::CLUSTER:
2433 case SIAtomicScope::WORKGROUP:
2434 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2435 // not needed as LDS operations for all waves are executed in a total
2436 // global ordering as observed by all waves. Required if also
2437 // synchronizing with global/GDS memory as LDS operations could be
2438 // reordered with respect to later global/GDS memory operations of the
2439 // same wave.
2440 DSCnt |= IsCrossAddrSpaceOrdering;
2441 break;
2442 case SIAtomicScope::WAVEFRONT:
2443 case SIAtomicScope::SINGLETHREAD:
2444 // The LDS keeps all memory operations in order for
2445 // the same wavefront.
2446 break;
2447 default:
2448 llvm_unreachable("Unsupported synchronization scope");
2449 }
2450 }
2451
2452 if (LOADCnt) {
2453 // Acquire sequences only need to wait on the previous atomic operation.
2454 // e.g. a typical sequence looks like
2455 // atomic load
2456 // (wait)
2457 // global_inv
2458 //
2459 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
2460 // to be tracked using loadcnt.
2461 //
2462 // This also applies to fences. Fences cannot pair with an instruction
2463 // tracked with bvh/samplecnt as we don't have any atomics that do that.
2464 if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
2465 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2466 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2467 }
2468 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2469 Changed = true;
2470 }
2471
2472 if (STORECnt) {
2473 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2474 Changed = true;
2475 }
2476
2477 if (DSCnt) {
2478 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2479 Changed = true;
2480 }
2481
2482 if (Pos == Position::AFTER)
2483 --MI;
2484
2485 return Changed;
2486}
2487
2488bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2489 SIAtomicScope Scope,
2490 SIAtomicAddrSpace AddrSpace,
2491 Position Pos) const {
2492 if (!InsertCacheInv)
2493 return false;
2494
2495 MachineBasicBlock &MBB = *MI->getParent();
2496 DebugLoc DL = MI->getDebugLoc();
2497
2498 /// The scratch address space does not need the global memory cache
2499 /// to be flushed as all memory operations by the same thread are
2500 /// sequentially consistent, and no other thread can access scratch
2501 /// memory.
2502
2503 /// Other address spaces do not have a cache.
2504 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2505 return false;
2506
2507 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2508 switch (Scope) {
2509 case SIAtomicScope::SYSTEM:
2510 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2511 break;
2512 case SIAtomicScope::AGENT:
2513 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2514 break;
2515 case SIAtomicScope::CLUSTER:
2516 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2517 break;
2518 case SIAtomicScope::WORKGROUP:
2519 // GFX12.0:
2520 // In WGP mode the waves of a work-group can be executing on either CU of
2521 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2522 // Otherwise in CU mode all waves of a work-group are on the same CU, and
2523 // so the L0 does not need to be invalidated.
2524 //
2525 // GFX12.5
2526 // TODO DOCS
2527 if (ST.isCuModeEnabled())
2528 return false;
2529
2530 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2531 break;
2532 case SIAtomicScope::WAVEFRONT:
2533 case SIAtomicScope::SINGLETHREAD:
2534 // No cache to invalidate.
2535 return false;
2536 default:
2537 llvm_unreachable("Unsupported synchronization scope");
2538 }
2539
2540 if (Pos == Position::AFTER)
2541 ++MI;
2542
2543 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2544
2545 if (Pos == Position::AFTER)
2546 --MI;
2547
2548 return true;
2549}
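// Illustrative result: an agent-scope acquire inserts roughly
// "global_inv scope:SCOPE_DEV" after the atomic, while a cluster-scope or
// (WGP-mode) workgroup-scope acquire uses SCOPE_SE instead.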
2550
2551bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2552 SIAtomicScope Scope,
2553 SIAtomicAddrSpace AddrSpace,
2554 bool IsCrossAddrSpaceOrdering,
2555 Position Pos) const {
2556 MachineBasicBlock &MBB = *MI->getParent();
2557 DebugLoc DL = MI->getDebugLoc();
2558
2559 // The scratch address space does not need the global memory cache
2560 // writeback as all memory operations by the same thread are
2561 // sequentially consistent, and no other thread can access scratch
2562 // memory.
2563
2564 // Other address spaces do not have a cache.
2565 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2566 return false;
2567
2568 if (Pos == Position::AFTER)
2569 ++MI;
2570
2571 // global_wb is only necessary at system scope for GFX12.0; it is also
2572 // necessary at device scope for GFX12.5.
2573 //
2574 // Emitting it for lower scopes is a slow no-op, so we omit it
2575 // for performance.
2576 switch (Scope) {
2577 case SIAtomicScope::SYSTEM:
2578 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2579 .addImm(AMDGPU::CPol::SCOPE_SYS);
2580 break;
2581 case SIAtomicScope::AGENT:
2582 // TODO DOCS
2583 if (ST.hasGFX1250Insts()) {
2584 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2585 .addImm(AMDGPU::CPol::SCOPE_DEV);
2586 }
2587 break;
2588 case SIAtomicScope::CLUSTER:
2589 case SIAtomicScope::WORKGROUP:
2590 // No WB necessary, but we still have to wait.
2591 break;
2592 case SIAtomicScope::WAVEFRONT:
2593 case SIAtomicScope::SINGLETHREAD:
2594 // No WB or wait necessary here.
2595 return false;
2596 default:
2597 llvm_unreachable("Unsupported synchronization scope");
2598 }
2599
2600 if (Pos == Position::AFTER)
2601 --MI;
2602
2603 // We always have to wait for previous memory operations (load/store) to
2604 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2605 // we of course need to wait for that as well.
2606 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2607 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
2608
2609 return true;
2610}
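// Illustrative result: a system-scope release on GFX12.0 becomes a "global_wb"
// with SCOPE_SYS followed by the soft waits emitted by insertWait above;
// workgroup- and cluster-scope releases skip the writeback and emit only the
// waits.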
2611
2612bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2613 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2614 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2615
2616 // Only handle load and store, not atomic read-modify-write instructions.
2617 assert(MI->mayLoad() ^ MI->mayStore());
2618
2619 // Only update load and store, not LLVM IR atomic read-modify-write
2620 // instructions. The latter are always marked as volatile, so they cannot be
2621 // handled sensibly here without pessimizing all atomics. They also do not support
2622 // the nontemporal attribute.
2623 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2624
2625 bool Changed = false;
2626
2627 if (IsLastUse) {
2628 // Set last-use hint.
2629 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2630 } else if (IsNonTemporal) {
2631 // Set non-temporal hint for all cache levels.
2632 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2633 }
2634
2635 if (IsVolatile) {
2636 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2637
2638 // Ensure operation has completed at system scope to cause all volatile
2639 // operations to be visible outside the program in a global order. Do not
2640 // request cross address space as only the global address space can be
2641 // observable outside the program, so no need to cause a waitcnt for LDS
2642 // address space operations.
2643 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2644 Position::AFTER, AtomicOrdering::Unordered);
2645 }
2646
2647 return Changed;
2648}
2649
2650bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2651 assert(MI.mayStore() && "Not a Store inst");
2652 const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2653 bool Changed = false;
2654
2655 // GFX12.5 only: an xcnt wait is needed before flat and global atomic
2656 // stores/rmw.
2657 if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
2658 MachineBasicBlock &MBB = *MI.getParent();
2659 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2660 Changed = true;
2661 }
2662
2663 // Remaining fixes do not apply to RMWs.
2664 if (IsRMW)
2665 return Changed;
2666
2667 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2668 if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2669 return Changed;
2670 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2671
2672 // GFX12.0 only: Extra waits needed before system scope stores.
2673 if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
2674 Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
2675
2676 return Changed;
2677}
2678
2679bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2680 if (!ST.hasGFX1250Insts())
2681 return false;
2682
2683 // Cooperative atomics need to be SCOPE_DEV or higher.
2684 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2685 assert(CPol && "No CPol operand?");
2686 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2687 if (Scope < CPol::SCOPE_DEV)
2688 return setScope(MI, CPol::SCOPE_DEV);
2689 return false;
2690}
2691
2692bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2693 SIAtomicScope Scope,
2694 SIAtomicAddrSpace AddrSpace) const {
2695 bool Changed = false;
2696
2697 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2698 switch (Scope) {
2699 case SIAtomicScope::SYSTEM:
2700 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2701 break;
2702 case SIAtomicScope::AGENT:
2703 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2704 break;
2705 case SIAtomicScope::CLUSTER:
2706 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2707 break;
2708 case SIAtomicScope::WORKGROUP:
2709 // In workgroup mode, SCOPE_SE is needed as waves can execute on
2710 // different CUs that access different L0s.
2711 if (!ST.isCuModeEnabled())
2712 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2713 break;
2714 case SIAtomicScope::WAVEFRONT:
2715 case SIAtomicScope::SINGLETHREAD:
2716 // No cache to bypass.
2717 break;
2718 default:
2719 llvm_unreachable("Unsupported synchronization scope");
2720 }
2721 }
2722
2723 // The scratch address space does not need the global memory caches
2724 // to be bypassed as all memory operations by the same thread are
2725 // sequentially consistent, and no other thread can access scratch
2726 // memory.
2727
2728 // Other address spaces do not have a cache.
2729
2730 return Changed;
2731}
2732
2733bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2734 if (AtomicPseudoMIs.empty())
2735 return false;
2736
2737 for (auto &MI : AtomicPseudoMIs)
2738 MI->eraseFromParent();
2739
2740 AtomicPseudoMIs.clear();
2741 return true;
2742}
2743
2744 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2745 MachineBasicBlock::iterator &MI) {
2746 assert(MI->mayLoad() && !MI->mayStore());
2747
2748 bool Changed = false;
2749
2750 if (MOI.isAtomic()) {
2751 const AtomicOrdering Order = MOI.getOrdering();
2752 if (Order == AtomicOrdering::Monotonic ||
2753 Order == AtomicOrdering::Acquire ||
2754 Order == AtomicOrdering::SequentiallyConsistent) {
2755 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2756 MOI.getOrderingAddrSpace());
2757 }
2758
2759 // Handle cooperative atomics after cache bypass step, as it may override
2760 // the scope of the instruction to a greater scope.
2761 if (MOI.isCooperative())
2762 Changed |= CC->handleCooperativeAtomic(*MI);
2763
2764 if (Order == AtomicOrdering::SequentiallyConsistent)
2765 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2766 SIMemOp::LOAD | SIMemOp::STORE,
2767 MOI.getIsCrossAddressSpaceOrdering(),
2768 Position::BEFORE, Order);
2769
2770 if (Order == AtomicOrdering::Acquire ||
2771 Order == AtomicOrdering::SequentiallyConsistent) {
2772 Changed |= CC->insertWait(
2773 MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
2774 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2775 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2776 MOI.getOrderingAddrSpace(),
2777 Position::AFTER);
2778 }
2779
2780 return Changed;
2781 }
2782
2783 // Atomic instructions already bypass caches to the scope specified by the
2784 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2785 // instructions need additional treatment.
2786 Changed |= CC->enableVolatileAndOrNonTemporal(
2787 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2788 MOI.isNonTemporal(), MOI.isLastUse());
2789
2790 return Changed;
2791}
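// Illustrative mapping (assuming a global atomic load in LLVM IR):
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
// enableLoadCacheBypass marks the load itself, then insertWait and
// insertAcquire add the wait and the cache invalidate after it; a seq_cst load
// additionally gets a wait inserted before it.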
2792
2793 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2794 MachineBasicBlock::iterator &MI) {
2795 assert(!MI->mayLoad() && MI->mayStore());
2796
2797 bool Changed = false;
2798 // FIXME: Necessary hack because iterator can lose track of the store.
2799 MachineInstr &StoreMI = *MI;
2800
2801 if (MOI.isAtomic()) {
2802 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2803 MOI.getOrdering() == AtomicOrdering::Release ||
2804 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2805 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2806 MOI.getOrderingAddrSpace());
2807 }
2808
2809 // Handle cooperative atomics after cache bypass step, as it may override
2810 // the scope of the instruction to a greater scope.
2811 if (MOI.isCooperative())
2812 Changed |= CC->handleCooperativeAtomic(*MI);
2813
2814 if (MOI.getOrdering() == AtomicOrdering::Release ||
2815 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2816 Changed |= CC->insertRelease(MI, MOI.getScope(),
2817 MOI.getOrderingAddrSpace(),
2818 MOI.getIsCrossAddressSpaceOrdering(),
2819 Position::BEFORE);
2820
2821 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
2822 return Changed;
2823 }
2824
2825 // Atomic instructions already bypass caches to the scope specified by the
2826 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2827 // need additional treatment.
2828 Changed |= CC->enableVolatileAndOrNonTemporal(
2829 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2830 MOI.isNonTemporal());
2831
2832 // GFX12 specific: the scope (the desired coherence domain in the cache
2833 // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2834 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
2835 return Changed;
2836}
2837
2838 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2839 MachineBasicBlock::iterator &MI) {
2840 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2841
2842 AtomicPseudoMIs.push_back(MI);
2843 bool Changed = false;
2844
2845 const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2846
2847 if (MOI.isAtomic()) {
2848 const AtomicOrdering Order = MOI.getOrdering();
2849 if (Order == AtomicOrdering::Acquire) {
2850 Changed |= CC->insertWait(
2851 MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2852 MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
2853 }
2854
2855 if (Order == AtomicOrdering::Release ||
2856 Order == AtomicOrdering::AcquireRelease ||
2857 Order == AtomicOrdering::SequentiallyConsistent)
2858 /// TODO: This relies on a barrier always generating a waitcnt
2859 /// for LDS to ensure it is not reordered with the completion of
2860 /// the preceding LDS operations. If the barrier had a memory
2861 /// ordering and memory scope, then the library would not need to
2862 /// generate a fence. Could add support in this file for
2863 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2864 /// adding S_WAITCNT before a S_BARRIER.
2865 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2866 MOI.getIsCrossAddressSpaceOrdering(),
2867 Position::BEFORE);
2868
2869 // TODO: If both release and invalidate are happening they could be combined
2870 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2871 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2872 // track cache invalidate and write back instructions.
2873
2874 if (Order == AtomicOrdering::Acquire ||
2875 Order == AtomicOrdering::AcquireRelease ||
2876 Order == AtomicOrdering::SequentiallyConsistent)
2877 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2878 Position::BEFORE);
2879
2880 return Changed;
2881 }
2882
2883 return Changed;
2884}
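// Illustrative mapping: "fence syncscope("agent") acq_rel" produces a release
// (which includes the needed waits and any writeback) and an acquire (cache
// invalidate), both inserted before the fence; the ATOMIC_FENCE pseudo itself
// is queued in AtomicPseudoMIs and erased later by removeAtomicPseudoMIs().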
2885
2886 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2887 MachineBasicBlock::iterator &MI) {
2888 assert(MI->mayLoad() && MI->mayStore());
2889
2890 bool Changed = false;
2891 MachineInstr &RMWMI = *MI;
2892
2893 if (MOI.isAtomic()) {
2894 const AtomicOrdering Order = MOI.getOrdering();
2895 if (Order == AtomicOrdering::Monotonic ||
2896 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2897 Order == AtomicOrdering::AcquireRelease ||
2898 Order == AtomicOrdering::SequentiallyConsistent) {
2899 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2900 MOI.getInstrAddrSpace());
2901 }
2902
2903 if (Order == AtomicOrdering::Release ||
2904 Order == AtomicOrdering::AcquireRelease ||
2905 Order == AtomicOrdering::SequentiallyConsistent ||
2906 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2907 Changed |= CC->insertRelease(MI, MOI.getScope(),
2908 MOI.getOrderingAddrSpace(),
2909 MOI.getIsCrossAddressSpaceOrdering(),
2910 Position::BEFORE);
2911
2912 if (Order == AtomicOrdering::Acquire ||
2913 Order == AtomicOrdering::AcquireRelease ||
2914 Order == AtomicOrdering::SequentiallyConsistent ||
2915 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2916 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2917 Changed |= CC->insertWait(
2918 MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2919 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2920 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2921 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2922 MOI.getOrderingAddrSpace(),
2923 Position::AFTER);
2924 }
2925
2926 Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
2927 return Changed;
2928 }
2929
2930 return Changed;
2931}
2932
2933bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2934 const MachineModuleInfo &MMI =
2935 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2936 return SIMemoryLegalizer(MMI).run(MF);
2937}
2938
2939 PreservedAnalyses
2940 SIMemoryLegalizerPass::run(MachineFunction &MF,
2941 MachineFunctionAnalysisManager &MFAM) {
2942 auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
2943 .getCachedResult<MachineModuleAnalysis>(
2944 *MF.getFunction().getParent());
2945 assert(MMI && "MachineModuleAnalysis must be available");
2946 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2947 return PreservedAnalyses::all();
2948 return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
2949 }
2950
2951bool SIMemoryLegalizer::run(MachineFunction &MF) {
2952 bool Changed = false;
2953
2954 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2955 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
2956 CC = SICacheControl::create(ST);
2957
2958 for (auto &MBB : MF) {
2959 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2960
2961 // Unbundle instructions after the post-RA scheduler.
2962 if (MI->isBundle() && MI->mayLoadOrStore()) {
2963 MachineBasicBlock::instr_iterator II(MI->getIterator());
2964 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2965 I != E && I->isBundledWithPred(); ++I) {
2966 I->unbundleFromPred();
2967 for (MachineOperand &MO : I->operands())
2968 if (MO.isReg())
2969 MO.setIsInternalRead(false);
2970 }
2971
2972 MI->eraseFromParent();
2973 MI = II->getIterator();
2974 }
2975
2976 if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
2977 Changed |= CC->insertBarrierStart(MI);
2978 continue;
2979 }
2980
2981 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2982 continue;
2983
2984 if (const auto &MOI = MOA.getLoadInfo(MI))
2985 Changed |= expandLoad(*MOI, MI);
2986 else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2987 Changed |= expandStore(*MOI, MI);
2988 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2989 Changed |= expandAtomicFence(*MOI, MI);
2990 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2991 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2992 }
2993 }
2994
2995 Changed |= removeAtomicPseudoMIs();
2996 return Changed;
2997}
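// The pass can be exercised in isolation on MIR, for example with something
// like:
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a \
//       -run-pass=si-memory-legalizer input.mir -o -
// (an illustrative invocation; the pass argument comes from DEBUG_TYPE,
// "si-memory-legalizer").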
2998
2999INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
3000
3001char SIMemoryLegalizerLegacy::ID = 0;
3002char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
3003
3004 FunctionPass *llvm::createSIMemoryLegalizerPass() {
3005 return new SIMemoryLegalizerLegacy();
3006}