//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "GCNSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "gcn-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

42 "amdgpu-vgpr-index-mode",
43 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
44 cl::init(false));
45
46static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
47 cl::desc("Enable the use of AA during codegen."),
48 cl::init(true));
49
51 NSAThreshold("amdgpu-nsa-threshold",
52 cl::desc("Number of addresses from which to enable MIMG NSA."),
54
56
58 StringRef GPU,
59 StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }
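
  // Illustrative: with FS = "+wavefrontsize32", the checks above append
  // "-wavefrontsize16," and "-wavefrontsize64,", so only the explicitly
  // requested wave size remains enabled once FullFS is parsed below.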

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing; other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10;
    // those targets have FeatureWavefrontSize64 in their definition already.
    // For gfx10+ set wave32 as a default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }
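
  // Net effect (illustrative): a target without ADDR64 MUBUF variants (e.g.
  // gfx9) has FlatForGlobal forced on, while a target without flat
  // instructions (e.g. SI) has it forced off, regardless of the HSA default
  // chosen earlier.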

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.isAMDGCN() && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(this);

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

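  // The special cases below are the 64-bit shifts, which are still limited to
  // a single constant-bus (SGPR or literal) source operand even on gfx10+;
  // all other VALU instructions there may use two.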
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       const SchedRegion &Region) const {
  // Track register pressure so the scheduler can try to decrease pressure
  // once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
                                             const SchedRegion &Region) const {
  const Function &F = Region.RegionBegin->getMF()->getFunction();
  Attribute PostRADirectionAttr = F.getFnAttribute("amdgpu-post-ra-direction");
  if (!PostRADirectionAttr.isValid())
    return;

  StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
  if (PostRADirectionStr == "topdown") {
    Policy.OnlyTopDown = true;
    Policy.OnlyBottomUp = false;
  } else if (PostRADirectionStr == "bottomup") {
    Policy.OnlyTopDown = false;
    Policy.OnlyBottomUp = true;
  } else if (PostRADirectionStr == "bidirectional") {
    Policy.OnlyTopDown = false;
    Policy.OnlyBottomUp = false;
  } else {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(), "invalid value for postRA direction attribute");
    F.getContext().diagnose(Diag);
  }

  LLVM_DEBUG({
    const char *DirStr = "default";
    if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
      DirStr = "topdown";
    else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
      DirStr = "bottomup";
    else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
      DirStr = "bidirectional";

    dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
           << '\n';
  });
}
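
// For illustration, the attribute handled above is attached in IR as, e.g.:
//
//   define void @f() "amdgpu-post-ra-direction"="bottomup" { ... }
//
// Any other value is diagnosed rather than silently ignored.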

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, VGPRs,
                                                       DynamicVGPRBlockSize);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now, assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
  unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);

  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
  MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
  return {std::min(MinOcc, MaxOcc), MaxOcc};
}
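
// Worked example (illustrative numbers): if the workgroup-size bounds give
// {MinOcc, MaxOcc} = {4, 8} waves per EU while SGPR usage allows 5 waves and
// VGPR usage allows 4, then MaxOcc = min(8, min(5, 4)) = 4 and the function
// returns {min(4, 4), 4} = {4, 4}.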

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}
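
// Illustrative use of the attribute handled above: "amdgpu-num-sgpr"="48"
// requests a 48-SGPR budget; a request that does not cover the reserved
// registers or violates the "amdgpu-waves-per-eu" bounds is dropped, and one
// smaller than the preloaded user/system SGPRs is raised to cover them.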

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}
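
// For reference, with the usual field sizes (4 SGPRs for the private segment
// buffer resource descriptor, 2 for each of the six pointer/ID fields), this
// is 16 user + 5 system + 1 synthetic = 22 SGPRs.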

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
  const auto &[Min, Max] = NumVGPRBounds;

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
  if (Requested != Max && hasGFX90AInsts())
    Requested *= 2;

  // Make sure the requested value is inside the range of possible VGPR usage.
  return std::clamp(Requested, Min, Max);
}
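
// Note on the doubling above: gfx90a-class targets have a combined VGPR+AGPR
// register file twice the usual size, so an explicit "amdgpu-num-vgpr" request
// is scaled up to that budget (our reading; the code itself does not spell out
// the rationale).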

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
  return getBaseMaxNumVGPRs(
      F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
          getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return getMaxNumVGPRs(MF.getFunction());
}

std::pair<unsigned, unsigned>
GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
  const unsigned MaxVectorRegs = getMaxNumVGPRs(F);

  unsigned MaxNumVGPRs = MaxVectorRegs;
  unsigned MaxNumAGPRs = 0;

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: it should be possible to estimate maximum AGPR/VGPR pressure and
  // split the register file accordingly.
  if (hasGFX90AInsts()) {
    unsigned MinNumAGPRs = 0;
    const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
    const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();

    const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};

    // TODO: The lower bound should probably force the number of required
    // registers up, overriding amdgpu-waves-per-eu.
    std::tie(MinNumAGPRs, MaxNumAGPRs) =
        AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR,
                                        /*OnlyFirstRequired=*/true);

    if (MinNumAGPRs == DefaultNumAGPR.first) {
      // Default to splitting half the registers if AGPRs are required.
      MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
    } else {
      // Align to accum_offset's allocation granularity.
      MinNumAGPRs = alignTo(MinNumAGPRs, 4);

      MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
    }

    // Clamp values to be inbounds of our limits, and ensure min <= max.

    MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
    MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);

    MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
    MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);

    assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
           MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
           "invalid register counts");
  } else if (hasMAIInsts()) {
    // On gfx908 the number of AGPRs always equals the number of VGPRs.
    MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
  }

  return std::pair(MaxNumVGPRs, MaxNumAGPRs);
}
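
// Illustrative: on a gfx90a-class target with a 256-register budget and no
// "amdgpu-agpr-alloc" attribute, the logic above defaults to an even
// 128 VGPR / 128 AGPR split; with "amdgpu-agpr-alloc"="0" the whole budget
// goes to VGPRs and no AGPRs are allocated.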

void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // The def is a bundle: use the latency of the last instruction in the
    // bundle that writes the register, discounted by one for each bundled
    // instruction issued after it.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // The use is a bundle: start from the def's latency and discount it by
    // the number of bundled instructions issued before the first reader.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return NSAThreshold;
}
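
// Resolution order above: an explicit -amdgpu-nsa-threshold command-line value
// wins, then the "amdgpu-nsa-threshold" function attribute, then the option's
// default; explicit values are clamped to a minimum of 2, since NSA is
// meaningful only from two addresses upward.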

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      // FlatScratchInit cannot be true for graphics CC if enableFlatScratch()
      // is false.
      (ST.enableFlatScratch() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }
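
  // Illustrative: a function carrying "amdgpu-no-dispatch-ptr",
  // "amdgpu-no-queue-ptr", and "amdgpu-no-dispatch-id" (attributes that the
  // AMDGPUAttributor infers when the corresponding values are provably unused)
  // needs none of the three pointer/ID user SGPRs above.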

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}