LLVM: lib/Target/AMDGPU/AMDGPUSubtarget.cpp Source File

//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file

/// Implements the AMDGPU specific subclass of TargetSubtarget.

//

//===----------------------------------------------------------------------===//


#include "AMDGPUSubtarget.h"

#include "AMDGPUCallLowering.h"

#include "AMDGPUInstructionSelector.h"

#include "AMDGPULegalizerInfo.h"

#include "AMDGPURegisterBankInfo.h"

#include "R600Subtarget.h"

#include "SIMachineFunctionInfo.h"

#include "Utils/AMDGPUBaseInfo.h"

#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"

#include "llvm/CodeGen/MachineScheduler.h"

#include "llvm/CodeGen/TargetFrameLowering.h"

#include "llvm/IR/DiagnosticInfo.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/IR/IntrinsicsR600.h"

#include "llvm/IR/MDBuilder.h"

#include <algorithm>


using namespace llvm;


#define DEBUG_TYPE "amdgpu-subtarget"


/// Constructs the subtarget base object. The triple is taken by value and
/// moved into the TargetTriple member.
AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}


/// Reports whether the real True16 instruction variants should be used. This
/// requires both that the subtarget has True16 instructions and that the
/// EnableRealTrue16Insts flag is set.
bool AMDGPUSubtarget::useRealTrue16Insts() const {
  // Without True16 support the flag is irrelevant.
  if (!hasTrue16BitInsts())
    return false;
  return EnableRealTrue16Insts;
}


/// Returns the current value of the EnableD16Writes32BitVgpr flag.
bool AMDGPUSubtarget::hasD16Writes32BitVgpr() const {

  return EnableD16Writes32BitVgpr;

}


// Returns the largest per-workgroup LDS allocation (in bytes) that still lets
// the given function reach an occupancy of NWaves waves per SIMD / EU. Only
// the function's *maximum* flat workgroup size is considered.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned LanesPerWave = getWavefrontSize();
  const unsigned LargestGroupSize = getFlatWorkGroupSizes(F).second;

  // Waves needed by one workgroup of maximum size (rounded up), never fewer
  // than one.
  unsigned WavesPerGroup =
      (LargestGroupSize + LanesPerWave - 1) / LanesPerWave;
  if (WavesPerGroup == 0)
    WavesPerGroup = 1;

  // Number of such workgroups that have to be resident on the CU to provide
  // NWaves waves on each EU, never fewer than one.
  unsigned GroupsPerCU = (NWaves * getEUsPerCU()) / WavesPerGroup;
  if (GroupsPerCU == 0)
    GroupsPerCU = 1;

  // The LDS is shared evenly between the resident workgroups.
  return getLocalMemorySize() / GroupsPerCU;
}


/// Computes the minimum and maximum occupancy, in waves per EU, that can be
/// reached for a given LDS usage and a range of flat workgroup sizes.
///
/// \param LDSBytes LDS bytes used per workgroup; a value of 0 is treated as 1
///        when computing how many workgroups fit in the local memory.
/// \param FlatWorkGroupSizes (minimum, maximum) flat workgroup size.
/// \returns {minimum, maximum} waves per EU, each clamped to the range
///          [1, getMaxWavesPerEU()]. Returns {1, 1} when \p LDSBytes exceeds
///          getLocalMemorySize().
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(

    uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {


  // FIXME: We should take into account the LDS allocation granularity.

  // The divisor is clamped to 1 so that LDSBytes == 0 does not divide by zero.
  const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);


  // Queried LDS size may be larger than available on a CU, in which case we

  // consider the only achievable occupancy to be 1, in line with what we

  // consider the occupancy to be when the number of requested registers in a

  // particular bank is higher than the number of available ones in that bank.

  if (!MaxWGsLDS)

    return {1, 1};


  const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();


  // For a given workgroup size, yields the tuple
  // (waves per workgroup, workgroups per CU, waves per CU), where the number
  // of workgroups per CU is additionally capped by the LDS limit above.
  auto PropsFromWGSize = [=](unsigned WGSize)

      -> std::tuple<const unsigned, const unsigned, unsigned> {

    unsigned WavesPerWG = divideCeil(WGSize, WaveSize);

    unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);

    return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};

  };


  // The maximum group size will generally yield the minimum number of

  // workgroups, maximum number of waves, and minimum occupancy. The opposite is

  // generally true for the minimum group size. LDS or barrier resource

  // limitations can flip those minimums/maximums.

  const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes;

  auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);

  auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);


  // It is possible that we end up with flipped minimum and maximum number of

  // waves per CU when the number of minimum/maximum concurrent groups on the CU

  // is limited by LDS usage or barrier resources.

  if (MinWavesPerCU >= MaxWavesPerCU) {

    std::swap(MinWavesPerCU, MaxWavesPerCU);

  } else {

    // Total number of wave slots on the CU across all of its EUs.
    const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();


    // Look for a potential smaller group size than the maximum which decreases

    // the concurrent number of waves on the CU for the same number of

    // concurrent workgroups on the CU.

    unsigned MinWavesPerCUForWGSize =

        divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;

    if (MinWavesPerCU > MinWavesPerCUForWGSize) {

      unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;

      if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {

        // There may exist a smaller group size than the maximum that achieves

        // the minimum number of waves per CU. This group size is the largest

        // possible size that requires MaxWavesPerWG - E waves where E is

        // maximized under the following constraints.

        // 1. 0 <= E <= ExcessSlotsPerWG

        // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize

        MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,

                                                MaxWavesPerWG - MinWavesPerWG);

      }

    }


    // Look for a potential larger group size than the minimum which increases

    // the concurrent number of waves on the CU for the same number of

    // concurrent workgroups on the CU.

    unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;

    if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {

      // There may exist a larger group size than the minimum that achieves the

      // maximum number of waves per CU. This group size is the smallest

      // possible size that requires MinWavesPerWG + L waves where L is

      // maximized under the following constraints.

      // 1. 0 <= L <= LeftoverSlotsPerWG

      // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize

      MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,

                                              ((MaxWGSize - 1) / WaveSize) + 1 -

                                                  MinWavesPerWG);

    }

  }


  // Return the minimum/maximum number of waves on any EU, assuming that all

  // wavefronts are spread across all EUs as evenly as possible.

  // The minimum rounds down and the maximum rounds up when dividing by the
  // number of EUs.
  return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),

          std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};

}


/// Convenience overload: reads the LDS size recorded in the machine function's
/// SIMachineFunctionInfo and forwards it, together with the underlying IR
/// function, to the (LDS size, function) overload.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
    const MachineFunction &MF) const {
  const auto *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const auto LDSBytes = FuncInfo->getLDSSize();
  return getOccupancyWithWorkGroupSizes(LDSBytes, MF.getFunction());
}


/// Returns the default (minimum, maximum) flat workgroup size for calling
/// convention \p CC. The minimum is always 1. For the VS, LS, HS, ES, GS and
/// PS calling conventions the maximum is the wavefront size; for any other
/// calling convention it is the subtarget's maximum flat workgroup size.
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  bool CapAtWaveSize = false;
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    CapAtWaveSize = true;
    break;
  default:
    break;
  }

  const unsigned UpperBound =
      CapAtWaveSize ? getWavefrontSize() : getMaxFlatWorkGroupSize();
  return {1u, UpperBound};
}


/// Returns the (minimum, maximum) flat workgroup size for \p F.
///
/// The "amdgpu-flat-work-group-size" attribute is honoured only when it is
/// well formed: the minimum must not exceed the maximum, and both must lie
/// within [getMinFlatWorkGroupSize(), getMaxFlatWorkGroupSize()]. Otherwise
/// the calling-convention default is returned.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Fallback range, derived from the calling convention.
  const std::pair<unsigned, unsigned> Fallback =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Range requested through the function attribute (Fallback if absent).
  const std::pair<unsigned, unsigned> FromAttr =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size",
                                      Fallback);

  // Reject an inverted range, or one outside the subtarget's limits.
  const bool Rejected = FromAttr.first > FromAttr.second ||
                        FromAttr.first < getMinFlatWorkGroupSize() ||
                        FromAttr.second > getMaxFlatWorkGroupSize();
  return Rejected ? Fallback : FromAttr;
}


/// Reconciles a requested (minimum, maximum) waves-per-EU range with what the
/// flat workgroup sizes and LDS usage allow.
///
/// \param RequestedWavesPerEU requested (minimum, maximum) waves per EU.
/// \param FlatWorkGroupSizes (minimum, maximum) flat workgroup size.
/// \param LDSBytes LDS usage forwarded to the occupancy computation.
/// \returns the default range when the request is unusable; otherwise the
///          request with its maximum capped at the default maximum.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> RequestedWavesPerEU,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
  // The default minimum is the number of waves per EU needed to run one
  // workgroup of maximum size; the default maximum is the highest occupancy
  // reachable with these workgroup sizes and this LDS usage.
  const unsigned WavesForLargestWG =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  const unsigned OccupancyCap =
      getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second;
  const std::pair<unsigned, unsigned> Default(
      std::min(WavesForLargestWG, OccupancyCap), OccupancyCap);

  // The requested minimum has to sit inside the default range and must not
  // exceed the requested maximum, which in turn must respect the subtarget
  // limit.
  const auto [ReqMin, ReqMax] = RequestedWavesPerEU;
  const bool Usable = ReqMin >= Default.first && ReqMin <= Default.second &&
                      ReqMin <= ReqMax && ReqMax <= getMaxWavesPerEU();
  if (!Usable)
    return Default;

  // The maximum occupancy implied by flat workgroup size and LDS cannot be
  // exceeded.
  return {ReqMin, std::min(ReqMax, Default.second)};
}


std::pair<unsigned, unsigned>


AMDGPUSubtarget::getWavesPerEU(const Function &F) const {

  // Default/requested minimum/maximum flat work group sizes.

  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // Minimum number of bytes allocated in the LDS.

  unsigned LDSBytes =

      AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", {0, UINT32_MAX},

                                      /*OnlyFirstRequired=*/true)

          .first;

  return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);

}


/// Returns the (minimum, maximum) waves per EU for \p F given explicit flat
/// workgroup sizes and LDS usage. The request is read from the
/// "amdgpu-waves-per-eu" attribute (defaulting to [1, getMaxWavesPerEU()])
/// and then validated by getEffectiveWavesPerEU().
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
                               unsigned LDSBytes, const Function &F) const {
  // Range assumed when the attribute is missing.
  const std::pair<unsigned, unsigned> FullRange(1, getMaxWavesPerEU());

  return getEffectiveWavesPerEU(
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", FullRange,
                                      true),
      FlatWorkGroupSizes, LDSBytes);
}


/// Returns operand \p Dim of the "reqd_work_group_size" metadata attached to
/// \p Kernel, or std::nullopt when the metadata is missing or does not have
/// exactly three operands. \p Dim is not range-checked here.
std::optional<unsigned>
AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel,
                                      unsigned Dim) const {
  auto *SizeMD = Kernel.getMetadata("reqd_work_group_size");
  if (!SizeMD || SizeMD->getNumOperands() != 3)
    return std::nullopt;

  auto *SizeConst = mdconst::extract<ConstantInt>(SizeMD->getOperand(Dim));
  return SizeConst->getZExtValue();
}


bool AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim(

    const Function &F, bool RequiresUniformYZ) const {

  auto *Node = F.getMetadata("reqd_work_group_size");

  if (!Node || Node->getNumOperands() != 3)

    return false;

  unsigned XLen =

      mdconst::extract<ConstantInt>(Node->getOperand(0))->getZExtValue();

  unsigned YLen =

      mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue();

  unsigned ZLen =

      mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue();


  bool Is1D = YLen <= 1 && ZLen <= 1;

  bool IsXLargeEnough =

      isPowerOf2_32(XLen) && (!RequiresUniformYZ || XLen >= getWavefrontSize());

  return Is1D || IsXLargeEnough;

}


/// Returns true when the subtarget's OS is Mesa3D and the calling convention
/// of \p F is not a shader calling convention.
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  if (!isMesa3DOS())
    return false;
  return !AMDGPU::isShader(F.getCallingConv());
}


/// Returns the largest workitem ID possible in \p Kernel for \p Dimension:
/// the required workgroup size for that dimension minus one when
/// reqd_work_group_size is available, otherwise the maximum flat workgroup
/// size minus one.
unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  if (const std::optional<unsigned> Required =
          getReqdWorkGroupSize(Kernel, Dimension))
    return *Required - 1;

  return getFlatWorkGroupSizes(Kernel).second - 1;
}


/// Returns true when the maximum workitem ID is 0 in each of the three
/// dimensions of \p Func. Dimensions are checked in order 0, 1, 2 and the
/// check stops at the first non-zero maximum.
bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  return getMaxWorkitemID(Func, 0) == 0 && getMaxWorkitemID(Func, 1) == 0 &&
         getMaxWorkitemID(Func, 2) == 0;
}


/// Attaches a value range to \p I, which is expected to be a workitem-id or
/// local-size query (an intrinsic call, or another instruction such as a
/// load).
///
/// The upper bound starts as the maximum flat workgroup size of the enclosing
/// function. When \p I is a call to one of the recognised workitem-id /
/// local-size intrinsics and the function carries reqd_work_group_size, the
/// bound is narrowed to the required size of the queried dimension.
///
/// For calls the range is added as a return attribute; for any other
/// instruction it is attached as !range metadata.
///
/// \returns false (leaving \p I untouched) when the computed maximum size is
///          zero, true otherwise.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    if (const Function *F = CI->getCalledFunction()) {
      // UINT_MAX marks "not a recognised per-dimension query".
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      // reqd_work_group_size has exactly three operands, so only dimensions
      // 0, 1 and 2 may be used to index it.
      if (Dim < 3) {
        std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize)
          MinSize = MaxSize = *ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}


unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {

  assert(AMDGPU::isKernel(F.getCallingConv()));


  // We don't allocate the segment if we know the implicit arguments weren't

  // used, even if the ABI implies we need them.

  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))

    return 0;


  if (isMesaKernel(F))

    return 16;


  // Assume all implicit inputs are used by default

  const Module *M = F.getParent();

  unsigned NBytes =

      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;

  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",

                                         NBytes);

}


/// Computes the size in bytes of the explicit kernel arguments of \p F when
/// they are laid out one after another, each at its own alignment. Arguments
/// carrying the "amdgpu-hidden-argument" attribute are skipped.
///
/// \param F a function with the AMDGPU_KERNEL or SPIR_KERNEL calling
///        convention.
/// \param [out] MaxAlign reset to 1 and then raised to the largest alignment
///        of any counted argument.
/// \returns the end offset of the last counted argument.
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &Layout = F.getDataLayout();
  MaxAlign = Align(1);

  uint64_t EndOffset = 0;
  for (const Argument &Arg : F.args()) {
    if (!Arg.hasAttribute("amdgpu-hidden-argument")) {
      // For byref arguments the pointee type and the parameter alignment
      // describe the in-segment layout; otherwise the argument's own type
      // and its ABI alignment are used.
      const bool PassedByRef = Arg.hasByRefAttr();
      Type *LayoutTy = PassedByRef ? Arg.getParamByRefType() : Arg.getType();
      const Align ArgAlign = Layout.getValueOrABITypeAlignment(
          PassedByRef ? Arg.getParamAlign() : std::nullopt, LayoutTy);
      const uint64_t ArgBytes = Layout.getTypeAllocSize(LayoutTy);

      EndOffset = alignTo(EndOffset, ArgAlign);
      EndOffset += ArgBytes;
      if (MaxAlign < ArgAlign)
        MaxAlign = ArgAlign;
    }
  }

  return EndOffset;
}


unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,

                                                Align &MaxAlign) const {

  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&

      F.getCallingConv() != CallingConv::SPIR_KERNEL)

    return 0;


  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);


  unsigned ExplicitOffset = getExplicitKernelArgOffset();


  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;

  unsigned ImplicitBytes = getImplicitArgNumBytes(F);

  if (ImplicitBytes != 0) {

    const Align Alignment = getAlignmentForImplicitArgPtr();

    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;

    MaxAlign = std::max(MaxAlign, Alignment);

  }


  // Being able to dereference past the end is useful for emitting scalar loads.

  return alignTo(TotalSize, 4);

}


/// Returns the DWARF flavour matching the wavefront size: Wave32 when the
/// wavefront size is 32, Wave64 for any other size.
AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  if (getWavefrontSize() == 32)
    return AMDGPUDwarfFlavour::Wave32;
  return AMDGPUDwarfFlavour::Wave64;
}


/// Returns the subtarget of \p MF viewed as an AMDGPUSubtarget. The concrete
/// type is GCNSubtarget when the target triple is AMDGCN and R600Subtarget
/// otherwise.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  const Triple &TT = MF.getTarget().getTargetTriple();
  if (!TT.isAMDGCN())
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<R600Subtarget>());
  return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<GCNSubtarget>());
}


/// Returns the subtarget that \p TM provides for \p F, viewed as an
/// AMDGPUSubtarget. The concrete type is GCNSubtarget when the target triple
/// is AMDGCN and R600Subtarget otherwise.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  const Triple &TT = TM.getTargetTriple();
  if (!TT.isAMDGCN())
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<R600Subtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<GCNSubtarget>(F));
}


// FIXME: This has no reason to be in subtarget
/// Reads the "amdgpu-max-num-workgroups" attribute of \p F as a vector of
/// three values, using the maximum uint32_t value as the default.
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  constexpr unsigned NumEntries = 3;
  constexpr unsigned DefaultLimit = std::numeric_limits<uint32_t>::max();
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups",
                                        NumEntries, DefaultLimit);
}


assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

AMDGPUBaseInfo.h

AMDGPUCallLowering.h
This file describes how to lower LLVM calls to machine code calls.

AMDGPUInstructionSelector.h
This file declares the targeting of the InstructionSelector class for AMDGPU.

AMDGPULegalizerInfo.h
This file declares the targeting of the Machinelegalizer class for AMDGPU.

AMDGPURegisterBankInfo.h
This file declares the targeting of the RegisterBankInfo class for AMDGPU.

AMDGPUSubtarget.h
Base class for AMDGPU specific classes of TargetSubtarget.

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

DiagnosticInfo.h

InlineAsmLowering.h
This file describes how to lower LLVM inline asm to machine code INLINEASM.

F
#define F(x, y, z)
Definition MD5.cpp:55

I
#define I(x, y, z)
Definition MD5.cpp:58

MDBuilder.h

MachineScheduler.h

Range
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))

R600Subtarget.h
AMDGPU R600 specific subclass of TargetSubtarget.

SIMachineFunctionInfo.h

TargetFrameLowering.h

Node
Definition ItaniumDemangle.h:166

llvm::AMDGPUSubtarget::isMesa3DOS
bool isMesa3DOS() const
Definition AMDGPUSubtarget.h:197

llvm::AMDGPUSubtarget::getDefaultFlatWorkGroupSize
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
Definition AMDGPUSubtarget.cpp:149

llvm::AMDGPUSubtarget::EnableRealTrue16Insts
bool EnableRealTrue16Insts
Definition AMDGPUSubtarget.h:61

llvm::AMDGPUSubtarget::getReqdWorkGroupSize
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
Definition AMDGPUSubtarget.cpp:237

llvm::AMDGPUSubtarget::getAlignmentForImplicitArgPtr
Align getAlignmentForImplicitArgPtr() const
Definition AMDGPUSubtarget.h:339

llvm::AMDGPUSubtarget::getEUsPerCU
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
Definition AMDGPUSubtarget.h:337

llvm::AMDGPUSubtarget::EnableD16Writes32BitVgpr
bool EnableD16Writes32BitVgpr
Definition AMDGPUSubtarget.h:62

llvm::AMDGPUSubtarget::isMesaKernel
bool isMesaKernel(const Function &F) const
Definition AMDGPUSubtarget.cpp:263

llvm::AMDGPUSubtarget::getWavesPerEU
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
Definition AMDGPUSubtarget.cpp:213

llvm::AMDGPUSubtarget::useRealTrue16Insts
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Definition AMDGPUSubtarget.cpp:37

llvm::AMDGPUSubtarget::getOccupancyWithWorkGroupSizes
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
Definition AMDGPUSubtarget.h:170

llvm::AMDGPUSubtarget::getFlatWorkGroupSizes
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
Definition AMDGPUSubtarget.cpp:163

llvm::AMDGPUSubtarget::makeLIDRangeMetadata
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
Definition AMDGPUSubtarget.cpp:284

llvm::AMDGPUSubtarget::getMaxWorkitemID
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
Definition AMDGPUSubtarget.cpp:267

llvm::AMDGPUSubtarget::getImplicitArgNumBytes
unsigned getImplicitArgNumBytes(const Function &F) const
Definition AMDGPUSubtarget.cpp:352

llvm::AMDGPUSubtarget::getLocalMemorySize
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
Definition AMDGPUSubtarget.h:322

llvm::AMDGPUSubtarget::getMaxNumWorkGroups
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
Definition AMDGPUSubtarget.cpp:438

llvm::AMDGPUSubtarget::getWavesPerEUForWorkGroup
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0

llvm::AMDGPUSubtarget::getMaxWorkGroupsPerCU
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0

llvm::AMDGPUSubtarget::getKernArgSegmentSize
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
Definition AMDGPUSubtarget.cpp:396

llvm::AMDGPUSubtarget::hasTrue16BitInsts
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
Definition AMDGPUSubtarget.h:218

llvm::AMDGPUSubtarget::getAMDGPUDwarfFlavour
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
Definition AMDGPUSubtarget.cpp:418

llvm::AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
Definition AMDGPUSubtarget.cpp:49

llvm::AMDGPUSubtarget::getMaxFlatWorkGroupSize
virtual unsigned getMaxFlatWorkGroupSize() const =0

llvm::AMDGPUSubtarget::AMDGPUSubtarget
AMDGPUSubtarget(Triple TT)
Definition AMDGPUSubtarget.cpp:35

llvm::AMDGPUSubtarget::getExplicitKernelArgOffset
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
Definition AMDGPUSubtarget.h:345

llvm::AMDGPUSubtarget::getMaxWavesPerEU
unsigned getMaxWavesPerEU() const
Definition AMDGPUSubtarget.h:382

llvm::AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
Definition AMDGPUSubtarget.cpp:245

llvm::AMDGPUSubtarget::getExplicitKernArgSize
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
Definition AMDGPUSubtarget.cpp:371

llvm::AMDGPUSubtarget::isSingleLaneExecution
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
Definition AMDGPUSubtarget.cpp:275

llvm::AMDGPUSubtarget::get
static const AMDGPUSubtarget & get(const MachineFunction &MF)
Definition AMDGPUSubtarget.cpp:423

llvm::AMDGPUSubtarget::getWavefrontSize
unsigned getWavefrontSize() const
Definition AMDGPUSubtarget.h:310

llvm::AMDGPUSubtarget::getMinFlatWorkGroupSize
virtual unsigned getMinFlatWorkGroupSize() const =0

llvm::AMDGPUSubtarget::getEffectiveWavesPerEU
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > RequestedWavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes, unsigned LDSBytes) const
Returns the target minimum/maximum number of waves per EU.
Definition AMDGPUSubtarget.cpp:186

llvm::AMDGPUSubtarget::hasD16Writes32BitVgpr
bool hasD16Writes32BitVgpr() const
Definition AMDGPUSubtarget.cpp:41

llvm::APInt
Class for arbitrary precision integers.
Definition APInt.h:78

llvm::Argument
This class represents an incoming formal argument to a Function.
Definition Argument.h:32

llvm::ConstantRange
This class represents a range of values.
Definition ConstantRange.h:47

llvm::DataLayout
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:63

llvm::Function
Definition Function.h:64

llvm::GCNSubtarget
Definition GCNSubtarget.h:34

llvm::GlobalObject::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the current metadata attachments for the given kind, if any.
Definition Value.h:576

llvm::Instruction
Definition Instruction.h:69

llvm::MDBuilder
Definition MDBuilder.h:37

llvm::MDBuilder::createRange
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96

llvm::MDNode
Metadata node.
Definition Metadata.h:1077

llvm::MachineFunction
Definition MachineFunction.h:286

llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition MachineFunction.h:762

llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition MachineFunction.h:733

llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Definition MachineFunction.h:860

llvm::MachineFunction::getTarget
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Definition MachineFunction.h:758

llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67

llvm::R600Subtarget
Definition R600Subtarget.h:29

llvm::SIMachineFunctionInfo
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Definition SIMachineFunctionInfo.h:412

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1197

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition TargetMachine.h:83

llvm::TargetMachine::getTargetTriple
const Triple & getTargetTriple() const
Definition TargetMachine.h:132

llvm::Triple
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47

llvm::Triple::isAMDGCN
bool isAMDGCN() const
Tests whether the target is AMDGCN.
Definition Triple.h:904

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45

uint32_t

uint64_t

llvm::AMDGPU::isShader
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
Definition AMDGPUBaseInfo.h:1425

llvm::AMDGPU::AMDHSA_COV5
@ AMDHSA_COV5
Definition AMDGPUBaseInfo.h:61

llvm::AMDGPU::getAMDHSACodeObjectVersion
unsigned getAMDHSACodeObjectVersion(const Module &M)
Definition AMDGPUBaseInfo.cpp:202

llvm::AMDGPU::isKernel
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
Definition AMDGPUBaseInfo.h:1499

llvm::AMDGPU::getIntegerVecAttribute
SmallVector< unsigned > getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size, unsigned DefaultVal)
Definition AMDGPUBaseInfo.cpp:1648

llvm::AMDGPU::getIntegerPairAttribute
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
Definition AMDGPUBaseInfo.cpp:1613

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24

llvm::CallingConv::AMDGPU_VS
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition CallingConv.h:188

llvm::CallingConv::AMDGPU_KERNEL
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition CallingConv.h:200

llvm::CallingConv::AMDGPU_HS
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition CallingConv.h:206

llvm::CallingConv::AMDGPU_GS
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition CallingConv.h:191

llvm::CallingConv::AMDGPU_PS
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition CallingConv.h:194

llvm::CallingConv::SPIR_KERNEL
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition CallingConv.h:144

llvm::CallingConv::AMDGPU_ES
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition CallingConv.h:218

llvm::CallingConv::AMDGPU_LS
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition CallingConv.h:213

llvm::mdconst::extract
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:666

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition AddressRanges.h:18

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649

llvm::HexPrintStyle::Upper
@ Upper
Definition NativeFormatting.h:23

llvm::HexPrintStyle::Lower
@ Lower
Definition NativeFormatting.h:23

llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288

llvm::divideCeil
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405

llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155

llvm::move
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1847

llvm::InstructionUniformity::Default
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20

llvm::AMDGPUDwarfFlavour
AMDGPUDwarfFlavour
Definition AMDGPUMCTargetDesc.h:33

llvm::Wave32
@ Wave32
Definition AMDGPUMCTargetDesc.h:33

llvm::Wave64
@ Wave64
Definition AMDGPUMCTargetDesc.h:33

std
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:851

std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39