LLVM: lib/Target/AMDGPU/AMDGPUSubtarget.h Source File

//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//==-----------------------------------------------------------------------===//

//

/// \file

/// Base class for AMDGPU specific classes of TargetSubtarget.

//

//===----------------------------------------------------------------------===//


#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H

#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H


#include "llvm/ADT/SmallVector.h"

#include "llvm/IR/CallingConv.h"

#include "llvm/Support/Alignment.h"

#include "llvm/TargetParser/Triple.h"


namespace llvm {


// Forward declarations. This header only names these types in function
// signatures (as a return type, by reference, or by pointer), so incomplete
// types are sufficient here and the defining headers need not be included.
enum AMDGPUDwarfFlavour : unsigned;

class Function;

class Instruction;

class MachineFunction;

class TargetMachine;


/// Base class for AMDGPU specific classes of TargetSubtarget.
///
/// Holds the target triple together with a set of feature flags and hardware
/// parameters, and exposes read-only queries on them. The class is abstract:
/// work-group and wave limits are pure virtual and must be supplied by a
/// derived subtarget.
class AMDGPUSubtarget {
public:
  /// Hardware generations. The enumerators carry explicit, strictly increasing
  /// numeric values.
  enum Generation {
    INVALID = 0,
    R600 = 1,
    R700 = 2,
    EVERGREEN = 3,
    NORTHERN_ISLANDS = 4,
    SOUTHERN_ISLANDS = 5,
    SEA_ISLANDS = 6,
    VOLCANIC_ISLANDS = 7,
    GFX9 = 8,
    GFX10 = 9,
    GFX11 = 10,
    GFX12 = 11,
  };

private:
  /// Triple this subtarget was created for; drives the OS / architecture
  /// queries below.
  Triple TargetTriple;

protected:
  // Feature flags and hardware parameters with their in-class defaults.
  // NOTE(review): presumably these are overwritten by the derived subtargets
  // while parsing subtarget features - the assigning code is not part of this
  // header, confirm there.
  bool GCN3Encoding = false;
  bool Has16BitInsts = false;
  bool HasTrue16BitInsts = false;
  bool HasFP8ConversionScaleInsts = false;
  bool HasBF8ConversionScaleInsts = false;
  bool HasFP4ConversionScaleInsts = false;
  bool HasFP6BF6ConversionScaleInsts = false;
  bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
  bool HasCvtPkF16F32Inst = false;
  bool HasF32ToF16BF16ConversionSRInsts = false;
  bool EnableRealTrue16Insts = false;
  bool HasBF16TransInsts = false;
  bool HasBF16ConversionInsts = false;
  bool HasBF16PackedInsts = false;
  bool HasMadMixInsts = false;
  bool HasMadMacF32Insts = false;
  bool HasDsSrc2Insts = false;
  bool HasSDWA = false;
  bool HasVOP3PInsts = false;
  bool HasMulI24 = true;
  bool HasMulU24 = true;
  bool HasSMulHi = false;
  bool HasInv2PiInlineImm = false;
  bool HasFminFmaxLegacy = true;
  bool EnablePromoteAlloca = false;
  bool HasTrigReducedRange = false;
  bool FastFMAF32 = false;
  unsigned EUsPerCU = 4;
  unsigned MaxWavesPerEU = 10;
  unsigned LocalMemorySize = 0;
  unsigned AddressableLocalMemorySize = 0;
  /// Base-2 logarithm of the wavefront size; see getWavefrontSize().
  char WavefrontSizeLog2 = 0;

public:
  /// Construct a subtarget for the triple \p TT.
  AMDGPUSubtarget(Triple TT);

  /// \returns The AMDGPU subtarget associated with \p MF.
  static const AMDGPUSubtarget &get(const MachineFunction &MF);

  /// \returns The AMDGPU subtarget that \p TM provides for function \p F.
  static const AMDGPUSubtarget &get(const TargetMachine &TM,
                                    const Function &F);

  /// \returns Default range flat work group size for a calling convention.
  std::pair<unsigned, unsigned>
  getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;

  /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
  /// for function \p F, or minimum/maximum flat work group sizes explicitly
  /// requested using "amdgpu-flat-work-group-size" attribute attached to
  /// function \p F.
  ///
  /// \returns Subtarget's default values if explicitly requested values cannot
  /// be converted to integer, or violate subtarget's specifications.
  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;

  /// \returns The required size of workgroups that will be used to execute \p F
  /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
  /// metadata). Otherwise, returns std::nullopt.
  std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
                                               unsigned Dim) const;

  /// \returns true if \p F will execute in a manner that leaves the X
  /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
  /// wavefrontsize is uniform. This is true if either the Y and Z block
  /// dimensions are known to always be 1 or if the X dimension will always be a
  /// power of 2. If \p RequiresUniformYZ is true, it also ensures that the Y
  /// and Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
  /// wavesize64 would ordinarily pass this test, it won't with
  /// \p RequiresUniformYZ).
  ///
  /// This information is currently only gathered from the !reqd_work_group_size
  /// metadata on \p F, but this may be improved in the future.
  bool hasWavefrontsEvenlySplittingXDim(const Function &F,
                                        bool RequiresUniformYZ = false) const;

  /// \returns Subtarget's default pair of minimum/maximum number of waves per
  /// execution unit for function \p F, or minimum/maximum number of waves per
  /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
  /// attached to function \p F.
  ///
  /// \returns Subtarget's default values if explicitly requested values cannot
  /// be converted to integer, violate subtarget's specifications, or are not
  /// compatible with minimum/maximum number of waves limited by flat work group
  /// size, register usage, and/or lds usage.
  std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;

  /// Overload which uses the specified values for the flat work group sizes,
  /// rather than querying the function itself. \p FlatWorkGroupSizes should
  /// correspond to the function's value for getFlatWorkGroupSizes.
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;

  /// Overload which uses the specified values for the flat workgroup sizes and
  /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
  /// should correspond to the function's value for getFlatWorkGroupSizes and \p
  /// LDSBytes to the per-workgroup LDS allocation.
  std::pair<unsigned, unsigned>
  getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
                unsigned LDSBytes, const Function &F) const;

  /// Returns the target minimum/maximum number of waves per EU. This is based
  /// on the minimum/maximum number of \p RequestedWavesPerEU and further
  /// limited by the maximum achievable occupancy derived from the range of \p
  /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSizes,
                         unsigned LDSBytes) const;

  /// Return the amount of LDS that can be used that will not restrict the
  /// occupancy lower than WaveCount.
  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
                                           const Function &) const;

  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
  /// be achieved when the only function running on a CU is \p F and each
  /// workgroup running the function requires \p LDSBytes bytes of LDS space.
  /// This notably depends on the range of allowed flat group sizes for the
  /// function and hardware characteristics.
  ///
  /// Convenience wrapper: forwards to the overload below using
  /// getFlatWorkGroupSizes(F).
  std::pair<unsigned, unsigned>
  getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
    return getOccupancyWithWorkGroupSizes(LDSBytes, getFlatWorkGroupSizes(F));
  }

  /// Overload which uses the specified values for the flat work group sizes,
  /// rather than querying the function itself. \p FlatWorkGroupSizes should
  /// correspond to the function's value for getFlatWorkGroupSizes.
  std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
      uint32_t LDSBytes,
      std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;

  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
  /// be achieved when the only function running on a CU is \p MF. This notably
  /// depends on the range of allowed flat group sizes for the function, the
  /// amount of per-workgroup LDS space required by the function, and hardware
  /// characteristics.
  std::pair<unsigned, unsigned>
  getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;

  /// \returns true if the target triple's OS is AMDHSA.
  bool isAmdHsaOS() const { return TargetTriple.getOS() == Triple::AMDHSA; }

  /// \returns true if the target triple's OS is AMDPAL.
  bool isAmdPalOS() const { return TargetTriple.getOS() == Triple::AMDPAL; }

  /// \returns true if the target triple's OS is Mesa3D.
  bool isMesa3DOS() const { return TargetTriple.getOS() == Triple::Mesa3D; }

  /// NOTE(review): presumably true when \p F is a kernel compiled for Mesa;
  /// the exact criteria live in the out-of-line definition - confirm there.
  bool isMesaKernel(const Function &F) const;

  /// \returns true if the OS is AMDHSA or \p F is a Mesa kernel.
  bool isAmdHsaOrMesa(const Function &F) const {
    return isAmdHsaOS() || isMesaKernel(F);
  }

  /// \returns true if the target triple is an AMDGCN triple.
  bool isGCN() const { return TargetTriple.isAMDGCN(); }

  bool isGCN3Encoding() const { return GCN3Encoding; }

  bool has16BitInsts() const { return Has16BitInsts; }

  /// Return true if the subtarget supports True16 instructions.
  bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }

  /// Return true if real (non-fake) variants of True16 instructions using
  /// 16-bit registers should be code-generated. Fake True16 instructions are
  /// identical to non-fake ones except that they take 32-bit registers as
  /// operands and always use their low halves.
  // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
  // supported and the support for fake True16 instructions is removed.
  bool useRealTrue16Insts() const;

  bool hasBF16TransInsts() const { return HasBF16TransInsts; }

  bool hasBF16ConversionInsts() const { return HasBF16ConversionInsts; }

  bool hasBF16PackedInsts() const { return HasBF16PackedInsts; }

  bool hasMadMixInsts() const { return HasMadMixInsts; }

  bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; }

  bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; }

  bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; }

  bool hasFP6BF6ConversionScaleInsts() const {
    return HasFP6BF6ConversionScaleInsts;
  }

  bool hasF16BF16ToFP6BF6ConversionScaleInsts() const {
    return HasF16BF16ToFP6BF6ConversionScaleInsts;
  }

  bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }

  bool hasF32ToF16BF16ConversionSRInsts() const {
    return HasF32ToF16BF16ConversionSRInsts;
  }

  /// \returns true if the HasMadMacF32Insts flag is set. Targets whose triple
  /// is not AMDGCN always report true, regardless of the flag.
  bool hasMadMacF32Insts() const { return HasMadMacF32Insts || !isGCN(); }

  bool hasDsSrc2Insts() const { return HasDsSrc2Insts; }

  bool hasSDWA() const { return HasSDWA; }

  bool hasVOP3PInsts() const { return HasVOP3PInsts; }

  bool hasMulI24() const { return HasMulI24; }

  bool hasMulU24() const { return HasMulU24; }

  bool hasSMulHi() const { return HasSMulHi; }

  bool hasInv2PiInlineImm() const { return HasInv2PiInlineImm; }

  bool hasFminFmaxLegacy() const { return HasFminFmaxLegacy; }

  bool hasTrigReducedRange() const { return HasTrigReducedRange; }

  bool hasFastFMAF32() const { return FastFMAF32; }

  bool isPromoteAllocaEnabled() const { return EnablePromoteAlloca; }

  /// \returns The wavefront size, i.e. 2 raised to getWavefrontSizeLog2().
  unsigned getWavefrontSize() const {
    // Shift an unsigned one so the result already has the return type and no
    // signed shift is involved.
    return 1u << WavefrontSizeLog2;
  }

  /// \returns The base-2 logarithm of the wavefront size.
  unsigned getWavefrontSizeLog2() const { return WavefrontSizeLog2; }

  /// Return the maximum number of bytes of LDS available for all workgroups
  /// running on the same WGP or CU.
  /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
  /// limited to 64k.
  unsigned getLocalMemorySize() const { return LocalMemorySize; }

  /// Return the maximum number of bytes of LDS that can be allocated to a
  /// single workgroup.
  /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
  /// 128k in total.
  unsigned getAddressableLocalMemorySize() const {
    return AddressableLocalMemorySize;
  }

  /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
  /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
  /// CU mode into account.
  unsigned getEUsPerCU() const { return EUsPerCU; }

  /// \returns The alignment of the implicit argument pointer: 8 bytes on
  /// AMDHSA, 4 bytes on every other OS.
  Align getAlignmentForImplicitArgPtr() const {
    return isAmdHsaOS() ? Align(8) : Align(4);
  }

  /// \returns The offset in bytes from the start of the input buffer of the
  /// first explicit kernel argument: 0 for AMDHSA, AMDPAL and Mesa3D, and 36
  /// for an unknown or any other OS.
  unsigned getExplicitKernelArgOffset() const {
    // Every path through the switch returns (the default label catches all
    // remaining OS values), so no trailing unreachable marker is needed.
    switch (TargetTriple.getOS()) {
    case Triple::AMDHSA:
    case Triple::AMDPAL:
    case Triple::Mesa3D:
      return 0;
    case Triple::UnknownOS:
    default:
      // For legacy reasons unknown/other is treated as a different version of
      // mesa.
      return 36;
    }
  }

  /// \returns Maximum number of work groups per compute unit supported by the
  /// subtarget and limited by given \p FlatWorkGroupSize.
  virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;

  /// \returns Minimum flat work group size supported by the subtarget.
  virtual unsigned getMinFlatWorkGroupSize() const = 0;

  /// \returns Maximum flat work group size supported by the subtarget.
  virtual unsigned getMaxFlatWorkGroupSize() const = 0;

  /// \returns Number of waves per execution unit required to support the given
  /// \p FlatWorkGroupSize.
  virtual unsigned
  getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;

  /// \returns Minimum number of waves per execution unit supported by the
  /// subtarget.
  virtual unsigned getMinWavesPerEU() const = 0;

  /// \returns Maximum number of waves per execution unit supported by the
  /// subtarget without any kind of limitation.
  unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }

  /// Return the maximum workitem ID value in the function, for the given (0, 1,
  /// 2) dimension.
  unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;

  /// Return the maximum number of work groups for the function.
  /// NOTE(review): the layout of the returned vector (presumably one entry per
  /// dimension) is defined by the out-of-line implementation - confirm there.
  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;

  /// Return true if only a single workitem can be active in a wave.
  bool isSingleLaneExecution(const Function &Kernel) const;

  /// Creates value range metadata on a workitemid.* intrinsic call or load.
  bool makeLIDRangeMetadata(Instruction *I) const;

  /// \returns Number of bytes of arguments that are passed to a shader or
  /// kernel in addition to the explicit ones declared for the function.
  unsigned getImplicitArgNumBytes(const Function &F) const;

  /// NOTE(review): presumably returns the size in bytes of the explicit kernel
  /// arguments of \p F and reports the largest argument alignment through
  /// \p MaxAlign - confirm against the out-of-line definition.
  uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;

  /// NOTE(review): presumably returns the total kernel argument segment size
  /// for \p F and updates \p MaxAlign accordingly - confirm against the
  /// out-of-line definition.
  unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;

  /// \returns Corresponding DWARF register number mapping flavour for this
  /// subtarget's wavefront size.
  AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;

  /// Virtual so that derived subtargets can be destroyed through a base
  /// pointer or reference.
  virtual ~AMDGPUSubtarget() = default;
};


} // end namespace llvm


#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H

Alignment.h

CallingConv.h

F
#define F(x, y, z)
Definition: MD5.cpp:55

I
#define I(x, y, z)
Definition: MD5.cpp:58

SmallVector.h
This file defines the SmallVector class.

Triple.h

llvm::AMDGPUSubtarget
Definition: AMDGPUSubtarget.h:30

llvm::AMDGPUSubtarget::hasFP8ConversionScaleInsts
bool hasFP8ConversionScaleInsts() const
Definition: AMDGPUSubtarget.h:239

llvm::AMDGPUSubtarget::isMesa3DOS
bool isMesa3DOS() const
Definition: AMDGPUSubtarget.h:196

llvm::AMDGPUSubtarget::hasFminFmaxLegacy
bool hasFminFmaxLegacy() const
Definition: AMDGPUSubtarget.h:291

llvm::AMDGPUSubtarget::HasFminFmaxLegacy
bool HasFminFmaxLegacy
Definition: AMDGPUSubtarget.h:74

llvm::AMDGPUSubtarget::getDefaultFlatWorkGroupSize
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
Definition: AMDGPUSubtarget.cpp:145

llvm::AMDGPUSubtarget::isAmdPalOS
bool isAmdPalOS() const
Definition: AMDGPUSubtarget.h:192

llvm::AMDGPUSubtarget::HasBF16TransInsts
bool HasBF16TransInsts
Definition: AMDGPUSubtarget.h:62

llvm::AMDGPUSubtarget::WavefrontSizeLog2
char WavefrontSizeLog2
Definition: AMDGPUSubtarget.h:82

llvm::AMDGPUSubtarget::EnableRealTrue16Insts
bool EnableRealTrue16Insts
Definition: AMDGPUSubtarget.h:61

llvm::AMDGPUSubtarget::hasBF16PackedInsts
bool hasBF16PackedInsts() const
Definition: AMDGPUSubtarget.h:233

llvm::AMDGPUSubtarget::hasSDWA
bool hasSDWA() const
Definition: AMDGPUSubtarget.h:267

llvm::AMDGPUSubtarget::hasFP4ConversionScaleInsts
bool hasFP4ConversionScaleInsts() const
Definition: AMDGPUSubtarget.h:243

llvm::AMDGPUSubtarget::getReqdWorkGroupSize
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
Definition: AMDGPUSubtarget.cpp:233

llvm::AMDGPUSubtarget::getAlignmentForImplicitArgPtr
Align getAlignmentForImplicitArgPtr() const
Definition: AMDGPUSubtarget.h:336

llvm::AMDGPUSubtarget::hasMadMacF32Insts
bool hasMadMacF32Insts() const
Definition: AMDGPUSubtarget.h:259

llvm::AMDGPUSubtarget::HasDsSrc2Insts
bool HasDsSrc2Insts
Definition: AMDGPUSubtarget.h:67

llvm::AMDGPUSubtarget::getEUsPerCU
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
Definition: AMDGPUSubtarget.h:334

llvm::AMDGPUSubtarget::hasSMulHi
bool hasSMulHi() const
Definition: AMDGPUSubtarget.h:283

llvm::AMDGPUSubtarget::isMesaKernel
bool isMesaKernel(const Function &F) const
Definition: AMDGPUSubtarget.cpp:259

llvm::AMDGPUSubtarget::getWavesPerEU
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
Definition: AMDGPUSubtarget.cpp:209

llvm::AMDGPUSubtarget::HasBF16PackedInsts
bool HasBF16PackedInsts
Definition: AMDGPUSubtarget.h:64

llvm::AMDGPUSubtarget::HasTrue16BitInsts
bool HasTrue16BitInsts
Definition: AMDGPUSubtarget.h:53

llvm::AMDGPUSubtarget::hasCvtPkF16F32Inst
bool hasCvtPkF16F32Inst() const
Definition: AMDGPUSubtarget.h:253

llvm::AMDGPUSubtarget::useRealTrue16Insts
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Definition: AMDGPUSubtarget.cpp:37

llvm::AMDGPUSubtarget::getOccupancyWithWorkGroupSizes
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
Definition: AMDGPUSubtarget.h:169

llvm::AMDGPUSubtarget::Has16BitInsts
bool Has16BitInsts
Definition: AMDGPUSubtarget.h:52

llvm::AMDGPUSubtarget::getMinWavesPerEU
virtual unsigned getMinWavesPerEU() const =0

llvm::AMDGPUSubtarget::hasBF16ConversionInsts
bool hasBF16ConversionInsts() const
Definition: AMDGPUSubtarget.h:229

llvm::AMDGPUSubtarget::HasFP4ConversionScaleInsts
bool HasFP4ConversionScaleInsts
Definition: AMDGPUSubtarget.h:56

llvm::AMDGPUSubtarget::hasFP6BF6ConversionScaleInsts
bool hasFP6BF6ConversionScaleInsts() const
Definition: AMDGPUSubtarget.h:245

llvm::AMDGPUSubtarget::getFlatWorkGroupSizes
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
Definition: AMDGPUSubtarget.cpp:159

llvm::AMDGPUSubtarget::HasFP6BF6ConversionScaleInsts
bool HasFP6BF6ConversionScaleInsts
Definition: AMDGPUSubtarget.h:57

llvm::AMDGPUSubtarget::Generation
Generation
Definition: AMDGPUSubtarget.h:32

llvm::AMDGPUSubtarget::GFX10
@ GFX10
Definition: AMDGPUSubtarget.h:42

llvm::AMDGPUSubtarget::GFX9
@ GFX9
Definition: AMDGPUSubtarget.h:41

llvm::AMDGPUSubtarget::EVERGREEN
@ EVERGREEN
Definition: AMDGPUSubtarget.h:36

llvm::AMDGPUSubtarget::GFX12
@ GFX12
Definition: AMDGPUSubtarget.h:44

llvm::AMDGPUSubtarget::INVALID
@ INVALID
Definition: AMDGPUSubtarget.h:33

llvm::AMDGPUSubtarget::R700
@ R700
Definition: AMDGPUSubtarget.h:35

llvm::AMDGPUSubtarget::SEA_ISLANDS
@ SEA_ISLANDS
Definition: AMDGPUSubtarget.h:39

llvm::AMDGPUSubtarget::NORTHERN_ISLANDS
@ NORTHERN_ISLANDS
Definition: AMDGPUSubtarget.h:37

llvm::AMDGPUSubtarget::SOUTHERN_ISLANDS
@ SOUTHERN_ISLANDS
Definition: AMDGPUSubtarget.h:38

llvm::AMDGPUSubtarget::R600
@ R600
Definition: AMDGPUSubtarget.h:34

llvm::AMDGPUSubtarget::VOLCANIC_ISLANDS
@ VOLCANIC_ISLANDS
Definition: AMDGPUSubtarget.h:40

llvm::AMDGPUSubtarget::GFX11
@ GFX11
Definition: AMDGPUSubtarget.h:43

llvm::AMDGPUSubtarget::EUsPerCU
unsigned EUsPerCU
Definition: AMDGPUSubtarget.h:78

llvm::AMDGPUSubtarget::makeLIDRangeMetadata
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on an workitemid.* intrinsic call or load.
Definition: AMDGPUSubtarget.cpp:280

llvm::AMDGPUSubtarget::hasBF8ConversionScaleInsts
bool hasBF8ConversionScaleInsts() const
Definition: AMDGPUSubtarget.h:241

llvm::AMDGPUSubtarget::getMaxWorkitemID
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
Definition: AMDGPUSubtarget.cpp:263

llvm::AMDGPUSubtarget::getImplicitArgNumBytes
unsigned getImplicitArgNumBytes(const Function &F) const
Definition: AMDGPUSubtarget.cpp:348

llvm::AMDGPUSubtarget::getLocalMemorySize
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
Definition: AMDGPUSubtarget.h:319

llvm::AMDGPUSubtarget::getAddressableLocalMemorySize
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
Definition: AMDGPUSubtarget.h:327

llvm::AMDGPUSubtarget::getMaxNumWorkGroups
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
Definition: AMDGPUSubtarget.cpp:434

llvm::AMDGPUSubtarget::isGCN3Encoding
bool isGCN3Encoding() const
Definition: AMDGPUSubtarget.h:208

llvm::AMDGPUSubtarget::getWavesPerEUForWorkGroup
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0

llvm::AMDGPUSubtarget::getMaxWorkGroupsPerCU
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0

llvm::AMDGPUSubtarget::hasMadMixInsts
bool hasMadMixInsts() const
Definition: AMDGPUSubtarget.h:235

llvm::AMDGPUSubtarget::getWavefrontSizeLog2
unsigned getWavefrontSizeLog2() const
Definition: AMDGPUSubtarget.h:311

llvm::AMDGPUSubtarget::HasSMulHi
bool HasSMulHi
Definition: AMDGPUSubtarget.h:72

llvm::AMDGPUSubtarget::getKernArgSegmentSize
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
Definition: AMDGPUSubtarget.cpp:392

llvm::AMDGPUSubtarget::HasF16BF16ToFP6BF6ConversionScaleInsts
bool HasF16BF16ToFP6BF6ConversionScaleInsts
Definition: AMDGPUSubtarget.h:58

llvm::AMDGPUSubtarget::has16BitInsts
bool has16BitInsts() const
Definition: AMDGPUSubtarget.h:212

llvm::AMDGPUSubtarget::HasF32ToF16BF16ConversionSRInsts
bool HasF32ToF16BF16ConversionSRInsts
Definition: AMDGPUSubtarget.h:60

llvm::AMDGPUSubtarget::GCN3Encoding
bool GCN3Encoding
Definition: AMDGPUSubtarget.h:51

llvm::AMDGPUSubtarget::FastFMAF32
bool FastFMAF32
Definition: AMDGPUSubtarget.h:77

llvm::AMDGPUSubtarget::~AMDGPUSubtarget
virtual ~AMDGPUSubtarget()=default

llvm::AMDGPUSubtarget::hasTrue16BitInsts
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
Definition: AMDGPUSubtarget.h:217

llvm::AMDGPUSubtarget::isAmdHsaOrMesa
bool isAmdHsaOrMesa(const Function &F) const
Definition: AMDGPUSubtarget.h:202

llvm::AMDGPUSubtarget::LocalMemorySize
unsigned LocalMemorySize
Definition: AMDGPUSubtarget.h:80

llvm::AMDGPUSubtarget::MaxWavesPerEU
unsigned MaxWavesPerEU
Definition: AMDGPUSubtarget.h:79

llvm::AMDGPUSubtarget::HasMulU24
bool HasMulU24
Definition: AMDGPUSubtarget.h:71

llvm::AMDGPUSubtarget::hasFastFMAF32
bool hasFastFMAF32() const
Definition: AMDGPUSubtarget.h:299

llvm::AMDGPUSubtarget::HasMulI24
bool HasMulI24
Definition: AMDGPUSubtarget.h:70

llvm::AMDGPUSubtarget::HasTrigReducedRange
bool HasTrigReducedRange
Definition: AMDGPUSubtarget.h:76

llvm::AMDGPUSubtarget::isPromoteAllocaEnabled
bool isPromoteAllocaEnabled() const
Definition: AMDGPUSubtarget.h:303

llvm::AMDGPUSubtarget::hasTrigReducedRange
bool hasTrigReducedRange() const
Definition: AMDGPUSubtarget.h:295

llvm::AMDGPUSubtarget::getAMDGPUDwarfFlavour
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
Definition: AMDGPUSubtarget.cpp:414

llvm::AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
Definition: AMDGPUSubtarget.cpp:45

llvm::AMDGPUSubtarget::getMaxFlatWorkGroupSize
virtual unsigned getMaxFlatWorkGroupSize() const =0

llvm::AMDGPUSubtarget::hasDsSrc2Insts
bool hasDsSrc2Insts() const
Definition: AMDGPUSubtarget.h:263

llvm::AMDGPUSubtarget::getExplicitKernelArgOffset
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
Definition: AMDGPUSubtarget.h:342

llvm::AMDGPUSubtarget::hasF16BF16ToFP6BF6ConversionScaleInsts
bool hasF16BF16ToFP6BF6ConversionScaleInsts() const
Definition: AMDGPUSubtarget.h:249

llvm::AMDGPUSubtarget::getMaxWavesPerEU
unsigned getMaxWavesPerEU() const
Definition: AMDGPUSubtarget.h:379

llvm::AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
Definition: AMDGPUSubtarget.cpp:241

llvm::AMDGPUSubtarget::hasMulU24
bool hasMulU24() const
Definition: AMDGPUSubtarget.h:279

llvm::AMDGPUSubtarget::HasInv2PiInlineImm
bool HasInv2PiInlineImm
Definition: AMDGPUSubtarget.h:73

llvm::AMDGPUSubtarget::getExplicitKernArgSize
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
Definition: AMDGPUSubtarget.cpp:367

llvm::AMDGPUSubtarget::EnablePromoteAlloca
bool EnablePromoteAlloca
Definition: AMDGPUSubtarget.h:75

llvm::AMDGPUSubtarget::HasFP8ConversionScaleInsts
bool HasFP8ConversionScaleInsts
Definition: AMDGPUSubtarget.h:54

llvm::AMDGPUSubtarget::AddressableLocalMemorySize
unsigned AddressableLocalMemorySize
Definition: AMDGPUSubtarget.h:81

llvm::AMDGPUSubtarget::isAmdHsaOS
bool isAmdHsaOS() const
Definition: AMDGPUSubtarget.h:188

llvm::AMDGPUSubtarget::HasCvtPkF16F32Inst
bool HasCvtPkF16F32Inst
Definition: AMDGPUSubtarget.h:59

llvm::AMDGPUSubtarget::HasVOP3PInsts
bool HasVOP3PInsts
Definition: AMDGPUSubtarget.h:69

llvm::AMDGPUSubtarget::isSingleLaneExecution
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
Definition: AMDGPUSubtarget.cpp:271

llvm::AMDGPUSubtarget::isGCN
bool isGCN() const
Definition: AMDGPUSubtarget.h:206

llvm::AMDGPUSubtarget::get
static const AMDGPUSubtarget & get(const MachineFunction &MF)
Definition: AMDGPUSubtarget.cpp:419

llvm::AMDGPUSubtarget::HasBF8ConversionScaleInsts
bool HasBF8ConversionScaleInsts
Definition: AMDGPUSubtarget.h:55

llvm::AMDGPUSubtarget::HasSDWA
bool HasSDWA
Definition: AMDGPUSubtarget.h:68

llvm::AMDGPUSubtarget::HasMadMacF32Insts
bool HasMadMacF32Insts
Definition: AMDGPUSubtarget.h:66

llvm::AMDGPUSubtarget::hasBF16TransInsts
bool hasBF16TransInsts() const
Definition: AMDGPUSubtarget.h:227

llvm::AMDGPUSubtarget::getWavefrontSize
unsigned getWavefrontSize() const
Definition: AMDGPUSubtarget.h:307

llvm::AMDGPUSubtarget::getMinFlatWorkGroupSize
virtual unsigned getMinFlatWorkGroupSize() const =0

llvm::AMDGPUSubtarget::getEffectiveWavesPerEU
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > RequestedWavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes, unsigned LDSBytes) const
Returns the target minimum/maximum number of waves per EU.
Definition: AMDGPUSubtarget.cpp:182

llvm::AMDGPUSubtarget::HasBF16ConversionInsts
bool HasBF16ConversionInsts
Definition: AMDGPUSubtarget.h:63

llvm::AMDGPUSubtarget::hasInv2PiInlineImm
bool hasInv2PiInlineImm() const
Definition: AMDGPUSubtarget.h:287

llvm::AMDGPUSubtarget::hasF32ToF16BF16ConversionSRInsts
bool hasF32ToF16BF16ConversionSRInsts() const
Definition: AMDGPUSubtarget.h:255

llvm::AMDGPUSubtarget::HasMadMixInsts
bool HasMadMixInsts
Definition: AMDGPUSubtarget.h:65

llvm::AMDGPUSubtarget::getWavesPerEU
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
Overload which uses the specified values for the flat work group sizes, rather than querying the func...

llvm::AMDGPUSubtarget::hasVOP3PInsts
bool hasVOP3PInsts() const
Definition: AMDGPUSubtarget.h:271

llvm::AMDGPUSubtarget::hasMulI24
bool hasMulI24() const
Definition: AMDGPUSubtarget.h:275

llvm::Function
Definition: Function.h:64

llvm::Instruction
Definition: Instruction.h:69

llvm::MachineFunction
Definition: MachineFunction.h:286

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83

llvm::Triple
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:47

llvm::Triple::AMDHSA
@ AMDHSA
Definition: Triple.h:229

llvm::Triple::UnknownOS
@ UnknownOS
Definition: Triple.h:206

llvm::Triple::AMDPAL
@ AMDPAL
Definition: Triple.h:239

llvm::Triple::Mesa3D
@ Mesa3D
Definition: Triple.h:238

llvm::Triple::getOS
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:417

llvm::Triple::isAMDGCN
bool isAMDGCN() const
Tests whether the target is AMDGCN.
Definition: Triple.h:901

uint32_t

uint64_t

unsigned

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:164

llvm::codeview::PublicSymFlags::Function
@ Function

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18

llvm::AMDGPUDwarfFlavour
AMDGPUDwarfFlavour
Definition: AMDGPUMCTargetDesc.h:32

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39