LLVM: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp Source File

//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// \file

// This file implements a TargetTransformInfo analysis pass specific to the

// AMDGPU target machine. It uses the target's detailed information to provide

// more precise answers to certain TTI queries, while letting the target

// independent and default TTI implementations handle the rest.

//

//===----------------------------------------------------------------------===//


#include "AMDGPUTargetTransformInfo.h"

#include "AMDGPUTargetMachine.h"

#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "SIModeRegisterDefaults.h"

#include "llvm/Analysis/InlineCost.h"

#include "llvm/Analysis/LoopInfo.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/CodeGen/Analysis.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/IR/PatternMatch.h"

#include "llvm/Support/KnownBits.h"

#include <optional>


using namespace llvm;


#define DEBUG_TYPE "AMDGPUtti"


static cl::opt<unsigned> UnrollThresholdPrivate(

  "amdgpu-unroll-threshold-private",

  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),

  cl::init(2700), cl::Hidden);


static cl::opt<unsigned> UnrollThresholdLocal(

  "amdgpu-unroll-threshold-local",

  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),

  cl::init(1000), cl::Hidden);


static cl::opt<unsigned> UnrollThresholdIf(

  "amdgpu-unroll-threshold-if",

  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),

  cl::init(200), cl::Hidden);


static cl::opt<bool> UnrollRuntimeLocal(

  "amdgpu-unroll-runtime-local",

  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),

  cl::init(true), cl::Hidden);


static cl::opt<unsigned> UnrollMaxBlockToAnalyze(

    "amdgpu-unroll-max-block-to-analyze",

    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),

    cl::init(32), cl::Hidden);


static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",

                                       cl::Hidden, cl::init(4000),

                                       cl::desc("Cost of alloca argument"));


// If the amount of scratch memory to eliminate exceeds our ability to allocate

// it into registers we gain nothing by aggressively inlining functions for that

// heuristic.

static cl::opt<unsigned>

    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,

                    cl::init(256),

                    cl::desc("Maximum alloca size to use for inline cost"));


// Inliner constraint to achieve reasonable compilation time.

static cl::opt<size_t> InlineMaxBB(

    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),

    cl::desc("Maximum number of BBs allowed in a function after inlining"

             " (compile time constraint)"));


// This default unroll factor is based on microbenchmarks on gfx1030.

static cl::opt<unsigned> MemcpyLoopUnroll(

    "amdgpu-memcpy-loop-unroll",

    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "

             "operations when lowering memcpy as a loop"),

    cl::init(16), cl::Hidden);


static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,

                              unsigned Depth = 0) {

  const Instruction *I = dyn_cast<Instruction>(Cond);

  if (!I)

    return false;


  for (const Value *V : I->operand_values()) {

    if (!L->contains(I))

      continue;

    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {

      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {

                  return SubLoop->contains(PHI); }))

        return true;

    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))

      return true;

  }

  return false;

}


AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)

    : BaseT(TM, F.getDataLayout()),

      TargetTriple(TM->getTargetTriple()),

      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),

      TLI(ST->getTargetLowering()) {}


void AMDGPUTTIImpl::getUnrollingPreferences(

    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,

    OptimizationRemarkEmitter *ORE) const {

  const Function &F = *L->getHeader()->getParent();

  UP.Threshold =

      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);

  UP.MaxCount = std::numeric_limits<unsigned>::max();

  UP.Partial = true;


  // Conditional branch in a loop back edge needs 3 additional exec

  // manipulations in average.

  UP.BEInsns += 3;


  // We want to run unroll even for the loops which have been vectorized.

  UP.UnrollVectorizedLoop = true;


  // TODO: Do we want runtime unrolling?


  // Maximum alloca size than can fit registers. Reserve 16 registers.

  const unsigned MaxAlloca = (256 - 16) * 4;

  unsigned ThresholdPrivate = UnrollThresholdPrivate;

  unsigned ThresholdLocal = UnrollThresholdLocal;


  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the

  // provided threshold value as the default for Threshold

  if (MDNode *LoopUnrollThreshold =

          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {

    if (LoopUnrollThreshold->getNumOperands() == 2) {

      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(

          LoopUnrollThreshold->getOperand(1));

      if (MetaThresholdValue) {

        // We will also use the supplied value for PartialThreshold for now.

        // We may introduce additional metadata if it becomes necessary in the

        // future.

        UP.Threshold = MetaThresholdValue->getSExtValue();

        UP.PartialThreshold = UP.Threshold;

        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);

        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);

      }

    }

  }


  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);

  for (const BasicBlock *BB : L->getBlocks()) {

    const DataLayout &DL = BB->getDataLayout();

    unsigned LocalGEPsSeen = 0;


    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {

               return SubLoop->contains(BB); }))

        continue; // Block belongs to an inner loop.


    for (const Instruction &I : *BB) {

      // Unroll a loop which contains an "if" statement whose condition

      // defined by a PHI belonging to the loop. This may help to eliminate

      // if region and potentially even PHI itself, saving on both divergence

      // and registers used for the PHI.

      // Add a small bonus for each of such "if" statements.

      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {

        if (UP.Threshold < MaxBoost && Br->isConditional()) {

          BasicBlock *Succ0 = Br->getSuccessor(0);

          BasicBlock *Succ1 = Br->getSuccessor(1);

          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||

              (L->contains(Succ1) && L->isLoopExiting(Succ1)))

            continue;

          if (dependsOnLocalPhi(L, Br->getCondition())) {

            UP.Threshold += UnrollThresholdIf;

            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold

                              << " for loop:\n"

                              << *L << " due to " << *Br << '\n');

            if (UP.Threshold >= MaxBoost)

              return;

          }

        }

        continue;

      }


      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);

      if (!GEP)

        continue;


      unsigned AS = GEP->getAddressSpace();

      unsigned Threshold = 0;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS)

        Threshold = ThresholdPrivate;

      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)

        Threshold = ThresholdLocal;

      else

        continue;


      if (UP.Threshold >= Threshold)

        continue;


      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {

        const Value *Ptr = GEP->getPointerOperand();

        const AllocaInst *Alloca =

            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));

        if (!Alloca || !Alloca->isStaticAlloca())

          continue;

        Type *Ty = Alloca->getAllocatedType();

        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;

        if (AllocaSize > MaxAlloca)

          continue;

      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||

                 AS == AMDGPUAS::REGION_ADDRESS) {

        LocalGEPsSeen++;

        // Inhibit unroll for local memory if we have seen addressing not to

        // a variable, most likely we will be unable to combine it.

        // Do not unroll too deep inner loops for local memory to give a chance

        // to unroll an outer loop for a more important reason.

        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2)

          continue;


        const Value *V = getUnderlyingObject(GEP->getPointerOperand());

        if (!isa<GlobalVariable>(V) && !isa<Argument>(V))

          continue;


        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"

                          << *L << " due to LDS use.\n");

        UP.Runtime = UnrollRuntimeLocal;

      }


      // Check if GEP depends on a value defined by this loop itself.

      bool HasLoopDef = false;

      for (const Value *Op : GEP->operands()) {

        const Instruction *Inst = dyn_cast<Instruction>(Op);

        if (!Inst || L->isLoopInvariant(Op))

          continue;


        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {

             return SubLoop->contains(Inst); }))

          continue;

        HasLoopDef = true;

        break;

      }

      if (!HasLoopDef)

        continue;


      // We want to do whatever we can to limit the number of alloca

      // instructions that make it through to the code generator.  allocas

      // require us to use indirect addressing, which is slow and prone to

      // compiler bugs.  If this loop does an address calculation on an

      // alloca ptr, then we want to use a higher than normal loop unroll

      // threshold. This will give SROA a better chance to eliminate these

      // allocas.

      //

      // We also want to have more unrolling for local memory to let ds

      // instructions with different offsets combine.

      //

      // Don't use the maximum allowed value here as it will make some

      // programs way too big.

      UP.Threshold = Threshold;

      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold

                        << " for loop:\n"

                        << *L << " due to " << *GEP << '\n');

      if (UP.Threshold >= MaxBoost)

        return;

    }


    // If we got a GEP in a small BB from inner loop then increase max trip

    // count to analyze for better estimation cost in unroll

    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)

      UP.MaxIterationsCountToAnalyze = 32;

  }

}


void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,

                                          TTI::PeelingPreferences &PP) const {

  BaseT::getPeelingPreferences(L, SE, PP);

}


uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {

  return 1024;

}


const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {

    // Codegen control options which don't matter.

    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,

    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,

    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,

    AMDGPU::FeatureUnalignedAccessMode,


    AMDGPU::FeatureAutoWaitcntBeforeBarrier,


    // Property of the kernel/environment which can't actually differ.

    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,

    AMDGPU::FeatureTrapHandler,


    // The default assumption needs to be ecc is enabled, but no directly

    // exposed operations depend on it, so it can be safely inlined.

    AMDGPU::FeatureSRAMECC,


    // Perf-tuning features

    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};


GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)

    : BaseT(TM, F.getDataLayout()),

      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),

      TLI(ST->getTargetLowering()), CommonTTI(TM, F),

      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {

  SIModeRegisterDefaults Mode(F, *ST);

  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();

  HasFP64FP16Denormals =

      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();

}


bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {

  return !F || !ST->isSingleLaneExecution(*F);

}


unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {

  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector

  // registers. See getRegisterClassForType for the implementation.

  // In this case vector registers are not vector in terms of

  // VGPRs, but those which can hold multiple values.


  // This is really the number of registers to fill when vectorizing /

  // interleaving loops, so we lie to avoid trying to use all registers.

  return 4;

}


TypeSize


GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {

  switch (K) {

  case TargetTransformInfo::RGK_Scalar:

    return TypeSize::getFixed(32);

  case TargetTransformInfo::RGK_FixedWidthVector:

    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);

  case TargetTransformInfo::RGK_ScalableVector:

    return TypeSize::getScalable(0);

  }

  llvm_unreachable("Unsupported register kind");

}


unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {

  return 32;

}


unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {

  if (Opcode == Instruction::Load || Opcode == Instruction::Store)

    return 32 * 4 / ElemWidth;

  // For a given width return the max 0number of elements that can be combined

  // into a wider bit value:

  return (ElemWidth == 8 && ST->has16BitInsts())       ? 4

         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2

         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2

                                                       : 1;

}


unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,

                                         unsigned ChainSizeInBytes,

                                         VectorType *VecTy) const {

  unsigned VecRegBitWidth = VF * LoadSize;

  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)

    // TODO: Support element-size less than 32bit?

    return 128 / LoadSize;


  return VF;

}


unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,

                                             unsigned ChainSizeInBytes,

                                             VectorType *VecTy) const {

  unsigned VecRegBitWidth = VF * StoreSize;

  if (VecRegBitWidth > 128)

    return 128 / StoreSize;


  return VF;

}


unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {

  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||

      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||

      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||

      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||

      AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||

      AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {

    return 512;

  }


  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)

    return 8 * ST->getMaxPrivateElementSize();


  // Common to flat, global, local and region. Assume for unknown addrspace.

  return 128;

}


bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,

                                            Align Alignment,

                                            unsigned AddrSpace) const {

  // We allow vectorization of flat stores, even though we may need to decompose

  // them later if they may access private memory. We don't have enough context

  // here, and legalization can handle it.

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {

    return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&

           ChainSizeInBytes <= ST->getMaxPrivateElementSize();

  }

  return true;

}


bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,

                                             Align Alignment,

                                             unsigned AddrSpace) const {

  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);

}


bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,

                                              Align Alignment,

                                              unsigned AddrSpace) const {

  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);

}


uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {

  return 1024;

}


Type *GCNTTIImpl::getMemcpyLoopLoweringType(

    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,

    unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,

    std::optional<uint32_t> AtomicElementSize) const {


  if (AtomicElementSize)

    return Type::getIntNTy(Context, *AtomicElementSize * 8);


  // 16-byte accesses achieve the highest copy throughput.

  // If the operation has a fixed known length that is large enough, it is

  // worthwhile to return an even wider type and let legalization lower it into

  // multiple accesses, effectively unrolling the memcpy loop.

  // We also rely on legalization to decompose into smaller accesses for

  // subtargets and address spaces where it is necessary.

  //

  // Don't unroll if Length is not a constant, since unrolling leads to worse

  // performance for length values that are smaller or slightly larger than the

  // total size of the type returned here. Mitigating that would require a more

  // complex lowering for variable-length memcpy and memmove.

  unsigned I32EltsInVector = 4;

  if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length))

    return FixedVectorType::get(Type::getInt32Ty(Context),

                                MemcpyLoopUnroll * I32EltsInVector);


  return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);

}


void GCNTTIImpl::getMemcpyLoopResidualLoweringType(

    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,

    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,

    Align SrcAlign, Align DestAlign,

    std::optional<uint32_t> AtomicCpySize) const {


  if (AtomicCpySize)

    BaseT::getMemcpyLoopResidualLoweringType(

        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,

        DestAlign, AtomicCpySize);


  Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);

  while (RemainingBytes >= 16) {

    OpsOut.push_back(I32x4Ty);

    RemainingBytes -= 16;

  }


  Type *I64Ty = Type::getInt64Ty(Context);

  while (RemainingBytes >= 8) {

    OpsOut.push_back(I64Ty);

    RemainingBytes -= 8;

  }


  Type *I32Ty = Type::getInt32Ty(Context);

  while (RemainingBytes >= 4) {

    OpsOut.push_back(I32Ty);

    RemainingBytes -= 4;

  }


  Type *I16Ty = Type::getInt16Ty(Context);

  while (RemainingBytes >= 2) {

    OpsOut.push_back(I16Ty);

    RemainingBytes -= 2;

  }


  Type *I8Ty = Type::getInt8Ty(Context);

  while (RemainingBytes) {

    OpsOut.push_back(I8Ty);

    --RemainingBytes;

  }

}


unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {

  // Disable unrolling if the loop is not vectorized.

  // TODO: Enable this again.

  if (VF.isScalar())

    return 1;


  return 8;

}


bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,

                                       MemIntrinsicInfo &Info) const {

  switch (Inst->getIntrinsicID()) {

  case Intrinsic::amdgcn_ds_ordered_add:

  case Intrinsic::amdgcn_ds_ordered_swap: {

    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));

    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));

    if (!Ordering || !Volatile)

      return false; // Invalid.


    unsigned OrderingVal = Ordering->getZExtValue();

    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))

      return false;


    Info.PtrVal = Inst->getArgOperand(0);

    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);

    Info.ReadMem = true;

    Info.WriteMem = true;

    Info.IsVolatile = !Volatile->isZero();

    return true;

  }

  default:

    return false;

  }

}


InstructionCost GCNTTIImpl::getArithmeticInstrCost(

    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,

    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

    ArrayRef<const Value *> Args, const Instruction *CxtI) const {


  // Legalize the type.

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);


  // Because we don't have any legal vector operations, but the legal types, we

  // need to account for split vectors.

  unsigned NElts = LT.second.isVector() ?

    LT.second.getVectorNumElements() : 1;


  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;


  switch (ISD) {

  case ISD::SHL:

  case ISD::SRL:

  case ISD::SRA:

    if (SLT == MVT::i64)

      return get64BitInstrCost(CostKind) * LT.first * NElts;


    if (ST->has16BitInsts() && SLT == MVT::i16)

      NElts = (NElts + 1) / 2;


    // i32

    return getFullRateInstrCost() * LT.first * NElts;

  case ISD::ADD:

  case ISD::SUB:

  case ISD::AND:

  case ISD::OR:

  case ISD::XOR:

    if (SLT == MVT::i64) {

      // and, or and xor are typically split into 2 VALU instructions.

      return 2 * getFullRateInstrCost() * LT.first * NElts;

    }


    if (ST->has16BitInsts() && SLT == MVT::i16)

      NElts = (NElts + 1) / 2;


    return LT.first * NElts * getFullRateInstrCost();

  case ISD::MUL: {

    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);

    if (SLT == MVT::i64) {

      const int FullRateCost = getFullRateInstrCost();

      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;

    }


    if (ST->has16BitInsts() && SLT == MVT::i16)

      NElts = (NElts + 1) / 2;


    // i32

    return QuarterRateCost * NElts * LT.first;

  }

  case ISD::FMUL:

    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for

    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole

    // fused operation.

    if (CxtI && CxtI->hasOneUse())

      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {

        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());

        if (OPC == ISD::FADD || OPC == ISD::FSUB) {

          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)

            return TargetTransformInfo::TCC_Free;

          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)

            return TargetTransformInfo::TCC_Free;


          // Estimate all types may be fused with contract/unsafe flags

          const TargetOptions &Options = TLI->getTargetMachine().Options;

          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||

              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))

            return TargetTransformInfo::TCC_Free;

        }

      }

    [[fallthrough]];

  case ISD::FADD:

  case ISD::FSUB:

    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)

      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f64)

      return LT.first * NElts * get64BitInstrCost(CostKind);


    if (ST->has16BitInsts() && SLT == MVT::f16)

      NElts = (NElts + 1) / 2;


    if (SLT == MVT::f32 || SLT == MVT::f16)

      return LT.first * NElts * getFullRateInstrCost();

    break;

  case ISD::FDIV:

  case ISD::FREM:

    // FIXME: frem should be handled separately. The fdiv in it is most of it,

    // but the current lowering is also not entirely correct.

    if (SLT == MVT::f64) {

      int Cost = 7 * get64BitInstrCost(CostKind) +

                 getQuarterRateInstrCost(CostKind) +

                 3 * getHalfRateInstrCost(CostKind);

      // Add cost of workaround.

      if (!ST->hasUsableDivScaleConditionOutput())

        Cost += 3 * getFullRateInstrCost();


      return LT.first * Cost * NElts;

    }


    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {

      // TODO: This is more complicated, unsafe flags etc.

      if ((SLT == MVT::f32 && !HasFP32Denormals) ||

          (SLT == MVT::f16 && ST->has16BitInsts())) {

        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;

      }

    }


    if (SLT == MVT::f16 && ST->has16BitInsts()) {

      // 2 x v_cvt_f32_f16

      // f32 rcp

      // f32 fmul

      // v_cvt_f16_f32

      // f16 div_fixup

      int Cost =

          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);

      return LT.first * Cost * NElts;

    }


    if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {

      // Fast unsafe fdiv lowering:

      // f32 rcp

      // f32 fmul

      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();

      return LT.first * Cost * NElts;

    }


    if (SLT == MVT::f32 || SLT == MVT::f16) {

      // 4 more v_cvt_* insts without f16 insts support

      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +

                 1 * getQuarterRateInstrCost(CostKind);


      if (!HasFP32Denormals) {

        // FP mode switches.

        Cost += 2 * getFullRateInstrCost();

      }


      return LT.first * NElts * Cost;

    }

    break;

  case ISD::FNEG:

    // Use the backend' estimation. If fneg is not free each element will cost

    // one additional instruction.

    return TLI->isFNegFree(SLT) ? 0 : NElts;

  default:

    break;

  }


  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,

                                       Args, CxtI);

}


// Return true if there's a potential benefit from using v2f16/v2i16

// instructions for an intrinsic, even if it requires nontrivial legalization.


static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {

  switch (ID) {

  case Intrinsic::fma:

  case Intrinsic::fmuladd:

  case Intrinsic::copysign:

  case Intrinsic::minimumnum:

  case Intrinsic::maximumnum:

  case Intrinsic::canonicalize:

  // There's a small benefit to using vector ops in the legalized code.

  case Intrinsic::round:

  case Intrinsic::uadd_sat:

  case Intrinsic::usub_sat:

  case Intrinsic::sadd_sat:

  case Intrinsic::ssub_sat:

  case Intrinsic::abs:

    return true;

  default:

    return false;

  }

}


InstructionCost


GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,

                                  TTI::TargetCostKind CostKind) const {

  switch (ICA.getID()) {

  case Intrinsic::fabs:

    // Free source modifier in the common case.

    return 0;

  case Intrinsic::amdgcn_workitem_id_x:

  case Intrinsic::amdgcn_workitem_id_y:

  case Intrinsic::amdgcn_workitem_id_z:

    // TODO: If hasPackedTID, or if the calling context is not an entry point

    // there may be a bit instruction.

    return 0;

  case Intrinsic::amdgcn_workgroup_id_x:

  case Intrinsic::amdgcn_workgroup_id_y:

  case Intrinsic::amdgcn_workgroup_id_z:

  case Intrinsic::amdgcn_lds_kernel_id:

  case Intrinsic::amdgcn_dispatch_ptr:

  case Intrinsic::amdgcn_dispatch_id:

  case Intrinsic::amdgcn_implicitarg_ptr:

  case Intrinsic::amdgcn_queue_ptr:

    // Read from an argument register.

    return 0;

  default:

    break;

  }


  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))

    return BaseT::getIntrinsicInstrCost(ICA, CostKind);


  Type *RetTy = ICA.getReturnType();


  // Legalize the type.

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);


  unsigned NElts = LT.second.isVector() ?

    LT.second.getVectorNumElements() : 1;


  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;


  if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||

      (ST->hasPackedFP32Ops() && SLT == MVT::f32))

    NElts = (NElts + 1) / 2;


  // TODO: Get more refined intrinsic costs?

  unsigned InstRate = getQuarterRateInstrCost(CostKind);


  switch (ICA.getID()) {

  case Intrinsic::fma:

  case Intrinsic::fmuladd:

    if (SLT == MVT::f64) {

      InstRate = get64BitInstrCost(CostKind);

      break;

    }


    if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)

      InstRate = getFullRateInstrCost();

    else {

      InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)

                                     : getQuarterRateInstrCost(CostKind);

    }

    break;

  case Intrinsic::copysign:

    return NElts * getFullRateInstrCost();

  case Intrinsic::minimumnum:

  case Intrinsic::maximumnum: {

    // Instruction + 2 canonicalizes. For cases that need type promotion, we the

    // promotion takes the place of the canonicalize.

    unsigned NumOps = 3;

    if (const IntrinsicInst *II = ICA.getInst()) {

      // Directly legal with ieee=0

      // TODO: Not directly legal with strictfp

      if (fpenvIEEEMode(*II) == KnownIEEEMode::Off)

        NumOps = 1;

    }


    unsigned BaseRate =

        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();

    InstRate = BaseRate * NumOps;

    break;

  }

  case Intrinsic::canonicalize: {

    InstRate =

        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();

    break;

  }

  case Intrinsic::uadd_sat:

  case Intrinsic::usub_sat:

  case Intrinsic::sadd_sat:

  case Intrinsic::ssub_sat: {

    if (SLT == MVT::i16 || SLT == MVT::i32)

      InstRate = getFullRateInstrCost();


    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};

    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))

      NElts = 1;

    break;

  }

  case Intrinsic::abs:

    // Expansion takes 2 instructions for VALU

    if (SLT == MVT::i16 || SLT == MVT::i32)

      InstRate = 2 * getFullRateInstrCost();

    break;

  default:

    break;

  }


  return LT.first * NElts * InstRate;

}


InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,

                                           TTI::TargetCostKind CostKind,

                                           const Instruction *I) const {

  assert((I == nullptr || I->getOpcode() == Opcode) &&

         "Opcode should reflect passed instruction.");

  const bool SCost =

      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);

  const int CBrCost = SCost ? 5 : 7;

  switch (Opcode) {

  case Instruction::Br: {

    // Branch instruction takes about 4 slots on gfx900.

    const auto *BI = dyn_cast_or_null<BranchInst>(I);

    if (BI && BI->isUnconditional())

      return SCost ? 1 : 4;

    // Suppose conditional branch takes additional 3 exec manipulations

    // instructions in average.

    return CBrCost;

  }

  case Instruction::Switch: {

    const auto *SI = dyn_cast_or_null<SwitchInst>(I);

    // Each case (including default) takes 1 cmp + 1 cbr instructions in

    // average.

    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);

  }

  case Instruction::Ret:

    return SCost ? 1 : 10;

  }

  return BaseT::getCFInstrCost(Opcode, CostKind, I);

}


InstructionCost


GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,

                                       std::optional<FastMathFlags> FMF,

                                       TTI::TargetCostKind CostKind) const {

  if (TTI::requiresOrderedReduction(FMF))

    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);


  EVT OrigTy = TLI->getValueType(DL, Ty);


  // Computes cost on targets that have packed math instructions(which support

  // 16-bit types only).

  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)

    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);


  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  return LT.first * getFullRateInstrCost();

}


InstructionCost


GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,

                                   FastMathFlags FMF,

                                   TTI::TargetCostKind CostKind) const {

  EVT OrigTy = TLI->getValueType(DL, Ty);


  // Computes cost on targets that have packed math instructions(which support

  // 16-bit types only).

  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)

    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);


  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  return LT.first * getHalfRateInstrCost(CostKind);

}


InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,

                                               TTI::TargetCostKind CostKind,

                                               unsigned Index, const Value *Op0,

                                               const Value *Op1) const {

  switch (Opcode) {

  case Instruction::ExtractElement:

  case Instruction::InsertElement: {

    unsigned EltSize

      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());

    if (EltSize < 32) {

      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())

        return 0;

      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,

                                       Op1);

    }


    // Extracts are just reads of a subregister, so are free. Inserts are

    // considered free because we don't want to have any cost for scalarizing

    // operations, and we don't have to copy into a different register class.


    // Dynamic indexing isn't free and is best avoided.

    return Index == ~0u ? 2 : 0;

  }

  default:

    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);

  }

}


/// Analyze if the results of inline asm are divergent. If \p Indices is empty,

/// this is analyzing the collective result of all output registers. Otherwise,

/// this is only querying a specific result index if this returns multiple

/// registers in a struct.


bool GCNTTIImpl::isInlineAsmSourceOfDivergence(

  const CallInst *CI, ArrayRef<unsigned> Indices) const {

  // TODO: Handle complex extract indices

  if (Indices.size() > 1)

    return true;


  const DataLayout &DL = CI->getDataLayout();

  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  TargetLowering::AsmOperandInfoVector TargetConstraints =

      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);


  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];


  int OutputIdx = 0;

  for (auto &TC : TargetConstraints) {

    if (TC.Type != InlineAsm::isOutput)

      continue;


    // Skip outputs we don't care about.

    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)

      continue;


    TLI->ComputeConstraintToUse(TC, SDValue());


    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(

        TRI, TC.ConstraintCode, TC.ConstraintVT).second;


    // For AGPR constraints null is returned on subtargets without AGPRs, so

    // assume divergent for null.

    if (!RC || !TRI->isSGPRClass(RC))

      return true;

  }


  return false;

}


bool GCNTTIImpl::isReadRegisterSourceOfDivergence(

    const IntrinsicInst *ReadReg) const {

  Metadata *MD =

      cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();

  StringRef RegName =

      cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();


  // Special case registers that look like VCC.

  MVT VT = MVT::getVT(ReadReg->getType());

  if (VT == MVT::i1)

    return true;


  // Special case scalar registers that start with 'v'.

  if (RegName.starts_with("vcc") || RegName.empty())

    return false;


  // VGPR or AGPR is divergent. There aren't any specially named vector

  // registers.

  return RegName[0] == 'v' || RegName[0] == 'a';

}


/// \returns true if the result of the value could potentially be

/// different across workitems in a wavefront.


bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {

  if (const Argument *A = dyn_cast<Argument>(V))

    return !AMDGPU::isArgPassedInSGPR(A);


  // Loads from the private and flat address spaces are divergent, because

  // threads can execute the load instruction with the same inputs and get

  // different results.

  //

  // All other loads are not divergent, because if threads issue loads with the

  // same arguments, they will always get the same result.

  if (const LoadInst *Load = dyn_cast<LoadInst>(V))

    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||

           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;


  // Atomics are divergent because they are executed sequentially: when an

  // atomic operation refers to the same address in each thread, then each

  // thread after the first sees the value written by the previous thread as

  // original value.

  if (isa<AtomicRMWInst, AtomicCmpXchgInst>(V))

    return true;


  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {

    Intrinsic::ID IID = Intrinsic->getIntrinsicID();

    switch (IID) {

    case Intrinsic::read_register:

      return isReadRegisterSourceOfDivergence(Intrinsic);

    case Intrinsic::amdgcn_addrspacecast_nonnull: {

      unsigned SrcAS =

          Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();

      unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();

      return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&

             DstAS == AMDGPUAS::FLAT_ADDRESS &&

             ST->hasGloballyAddressableScratch();

    }

    case Intrinsic::amdgcn_workitem_id_y:

    case Intrinsic::amdgcn_workitem_id_z: {

      const Function *F = Intrinsic->getFunction();

      bool HasUniformYZ =

          ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequitezUniformYZ=*/true);

      std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(

          *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);

      return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);

    }

    default:

      return AMDGPU::isIntrinsicSourceOfDivergence(IID);

    }

  }


  // Assume all function calls are a source of divergence.

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {

    if (CI->isInlineAsm())

      return isInlineAsmSourceOfDivergence(CI);

    return true;

  }


  // Assume all function calls are a source of divergence.

  if (isa<InvokeInst>(V))

    return true;


  // If the target supports globally addressable scratch, the mapping from

  // scratch memory to the flat aperture changes therefore an address space cast

  // is no longer uniform.

  if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {

    return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&

           CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&

           ST->hasGloballyAddressableScratch();

  }


  return false;

}


bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))

    return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());


  if (const CallInst *CI = dyn_cast<CallInst>(V)) {

    if (CI->isInlineAsm())

      return !isInlineAsmSourceOfDivergence(CI);

    return false;

  }


  // In most cases TID / wavefrontsize is uniform.

  //

  // However, if a kernel has uneven dimesions we can have a value of

  // workitem-id-x divided by the wavefrontsize non-uniform. For example

  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)

  // packed into a same wave which gives 1 and 0 after the division by 64

  // respectively.

  //

  // The X dimension doesn't reset within a wave if either both the Y

  // and Z dimensions are of length 1, or if the X dimension's required

  // size is a power of 2. Note, however, if the X dimension's maximum

  // size is a power of 2 < the wavefront size, division by the wavefront

  // size is guaranteed to yield 0, so this is also a no-reset case.

  bool XDimDoesntResetWithinWaves = false;

  if (auto *I = dyn_cast<Instruction>(V)) {

    const Function *F = I->getFunction();

    XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);

  }

  using namespace llvm::PatternMatch;

  uint64_t C;

  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),

                      m_ConstantInt(C))) ||

      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),

                      m_ConstantInt(C)))) {

    return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;

  }


  Value *Mask;

  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),

                       m_Value(Mask)))) {

    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=

               ST->getWavefrontSizeLog2() &&

           XDimDoesntResetWithinWaves;

  }


  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);

  if (!ExtValue)

    return false;


  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));

  if (!CI)

    return false;


  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {

    switch (Intrinsic->getIntrinsicID()) {

    default:

      return false;

    case Intrinsic::amdgcn_if:

    case Intrinsic::amdgcn_else: {

      ArrayRef<unsigned> Indices = ExtValue->getIndices();

      return Indices.size() == 1 && Indices[0] == 1;

    }

    }

  }


  // If we have inline asm returning mixed SGPR and VGPR results, we inferred

  // divergent for the overall struct return. We need to override it in the

  // case we're extracting an SGPR component here.

  if (CI->isInlineAsm())

    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());


  return false;

}


bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,

                                            Intrinsic::ID IID) const {

  switch (IID) {

  case Intrinsic::amdgcn_is_shared:

  case Intrinsic::amdgcn_is_private:

  case Intrinsic::amdgcn_flat_atomic_fmax_num:

  case Intrinsic::amdgcn_flat_atomic_fmin_num:

  case Intrinsic::amdgcn_load_to_lds:

  case Intrinsic::amdgcn_make_buffer_rsrc:

    OpIndexes.push_back(0);

    return true;

  default:

    return false;

  }

}


Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,

                                                    Value *OldV,

                                                    Value *NewV) const {

  auto IntrID = II->getIntrinsicID();

  switch (IntrID) {

  case Intrinsic::amdgcn_is_shared:

  case Intrinsic::amdgcn_is_private: {

    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?

      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;

    unsigned NewAS = NewV->getType()->getPointerAddressSpace();

    LLVMContext &Ctx = NewV->getType()->getContext();

    ConstantInt *NewVal = (TrueAS == NewAS) ?

      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);

    return NewVal;

  }

  case Intrinsic::ptrmask: {

    unsigned OldAS = OldV->getType()->getPointerAddressSpace();

    unsigned NewAS = NewV->getType()->getPointerAddressSpace();

    Value *MaskOp = II->getArgOperand(1);

    Type *MaskTy = MaskOp->getType();


    bool DoTruncate = false;


    const GCNTargetMachine &TM =

        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());

    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {

      // All valid 64-bit to 32-bit casts work by chopping off the high

      // bits. Any masking only clearing the low bits will also apply in the new

      // address space.

      if (DL.getPointerSizeInBits(OldAS) != 64 ||

          DL.getPointerSizeInBits(NewAS) != 32)

        return nullptr;


      // TODO: Do we need to thread more context in here?

      KnownBits Known = computeKnownBits(MaskOp, DL, nullptr, II);

      if (Known.countMinLeadingOnes() < 32)

        return nullptr;


      DoTruncate = true;

    }


    IRBuilder<> B(II);

    if (DoTruncate) {

      MaskTy = B.getInt32Ty();

      MaskOp = B.CreateTrunc(MaskOp, MaskTy);

    }


    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},

                             {NewV, MaskOp});

  }

  case Intrinsic::amdgcn_flat_atomic_fmax_num:

  case Intrinsic::amdgcn_flat_atomic_fmin_num: {

    Type *DestTy = II->getType();

    Type *SrcTy = NewV->getType();

    unsigned NewAS = SrcTy->getPointerAddressSpace();

    if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))

      return nullptr;

    Module *M = II->getModule();

    Function *NewDecl = Intrinsic::getOrInsertDeclaration(

        M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});

    II->setArgOperand(0, NewV);

    II->setCalledFunction(NewDecl);

    return II;

  }

  case Intrinsic::amdgcn_load_to_lds: {

    Type *SrcTy = NewV->getType();

    Module *M = II->getModule();

    Function *NewDecl =

        Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});

    II->setArgOperand(0, NewV);

    II->setCalledFunction(NewDecl);

    return II;

  }

  case Intrinsic::amdgcn_make_buffer_rsrc: {

    Type *SrcTy = NewV->getType();

    Type *DstTy = II->getType();

    Module *M = II->getModule();

    Function *NewDecl = Intrinsic::getOrInsertDeclaration(

        M, II->getIntrinsicID(), {DstTy, SrcTy});

    II->setArgOperand(0, NewV);

    II->setCalledFunction(NewDecl);

    return II;

  }

  default:

    return nullptr;

  }

}


InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,

                                           VectorType *DstTy, VectorType *SrcTy,

                                           ArrayRef<int> Mask,

                                           TTI::TargetCostKind CostKind,

                                           int Index, VectorType *SubTp,

                                           ArrayRef<const Value *> Args,

                                           const Instruction *CxtI) const {

  if (!isa<FixedVectorType>(SrcTy))

    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,

                                 SubTp);


  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);


  unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());

  if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&

      (ScalarSize == 16 || ScalarSize == 8)) {

    // Larger vector widths may require additional instructions, but are

    // typically cheaper than scalarized versions.

    unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();

    unsigned RequestedElts =

        count_if(Mask, [](int MaskElt) { return MaskElt != -1; });

    unsigned EltsPerReg = 32 / ScalarSize;

    if (RequestedElts == 0)

      return 0;

    switch (Kind) {

    case TTI::SK_Broadcast:

    case TTI::SK_Reverse:

    case TTI::SK_PermuteSingleSrc: {

      // With op_sel VOP3P instructions freely can access the low half or high

      // half of a register, so any swizzle of two elements is free.

      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)

        return 0;

      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;

      // SK_Broadcast just reuses the same mask

      unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;

      return NumPerms + NumPermMasks;

    }

    case TTI::SK_ExtractSubvector:

    case TTI::SK_InsertSubvector: {

      // Even aligned accesses are free

      if (!(Index % 2))

        return 0;

      // Insert/extract subvectors only require shifts / extract code to get the

      // relevant bits

      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;

    }

    case TTI::SK_PermuteTwoSrc:

    case TTI::SK_Splice:

    case TTI::SK_Select: {

      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;

      // SK_Select just reuses the same mask

      unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;

      return NumPerms + NumPermMasks;

    }


    default:

      break;

    }

  }


  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,

                               SubTp);

}


/// Whether it is profitable to sink the operands of an

/// Instruction I to the basic block of I.

/// This helps using several modifiers (like abs and neg) more often.


bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,

                                            SmallVectorImpl<Use *> &Ops) const {

  using namespace PatternMatch;


  for (auto &Op : I->operands()) {

    // Ensure we are not already sinking this operand.

    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))

      continue;


    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))

      Ops.push_back(&Op);

  }


  return !Ops.empty();

}


bool GCNTTIImpl::areInlineCompatible(const Function *Caller,

                                     const Function *Callee) const {

  const TargetMachine &TM = getTLI()->getTargetMachine();

  const GCNSubtarget *CallerST

    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));

  const GCNSubtarget *CalleeST

    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));


  const FeatureBitset &CallerBits = CallerST->getFeatureBits();

  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();


  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;

  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;

  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)

    return false;


  // FIXME: dx10_clamp can just take the caller setting, but there seems to be

  // no way to support merge for backend defined attributes.

  SIModeRegisterDefaults CallerMode(*Caller, *CallerST);

  SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);

  if (!CallerMode.isInlineCompatible(CalleeMode))

    return false;


  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||

      Callee->hasFnAttribute(Attribute::InlineHint))

    return true;


  // Hack to make compile times reasonable.

  if (InlineMaxBB) {

    // Single BB does not increase total BB amount.

    if (Callee->size() == 1)

      return true;

    size_t BBSize = Caller->size() + Callee->size() - 1;

    return BBSize <= InlineMaxBB;

  }


  return true;

}


static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,

                                                   const SITargetLowering *TLI,

                                                   const GCNTTIImpl *TTIImpl) {

  const int NrOfSGPRUntilSpill = 26;

  const int NrOfVGPRUntilSpill = 32;


  const DataLayout &DL = TTIImpl->getDataLayout();


  unsigned adjustThreshold = 0;

  int SGPRsInUse = 0;

  int VGPRsInUse = 0;

  for (const Use &A : CB->args()) {

    SmallVector<EVT, 4> ValueVTs;

    ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);

    for (auto ArgVT : ValueVTs) {

      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(

          CB->getContext(), CB->getCallingConv(), ArgVT);

      if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))

        SGPRsInUse += CCRegNum;

      else

        VGPRsInUse += CCRegNum;

    }

  }


  // The cost of passing function arguments through the stack:

  //  1 instruction to put a function argument on the stack in the caller.

  //  1 instruction to take a function argument from the stack in callee.

  //  1 instruction is explicitly take care of data dependencies in callee

  //  function.

  InstructionCost ArgStackCost(1);

  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(

      Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),

      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);

  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(

      Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),

      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);


  // The penalty cost is computed relative to the cost of instructions and does

  // not model any storage costs.

  adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *

                     ArgStackCost.getValue() * InlineConstants::getInstrCost();

  adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *

                     ArgStackCost.getValue() * InlineConstants::getInstrCost();

  return adjustThreshold;

}


static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,

                                           const DataLayout &DL) {

  // If we have a pointer to a private array passed into a function

  // it will not be optimized out, leaving scratch usage.

  // This function calculates the total size in bytes of the memory that would

  // end in scratch if the call was not inlined.

  unsigned AllocaSize = 0;

  SmallPtrSet<const AllocaInst *, 8> AIVisited;

  for (Value *PtrArg : CB->args()) {

    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());

    if (!Ty)

      continue;


    unsigned AddrSpace = Ty->getAddressSpace();

    if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&

        AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)

      continue;


    const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));

    if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)

      continue;


    AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());

  }

  return AllocaSize;

}


int GCNTTIImpl::getInliningLastCallToStaticBonus() const {

  return BaseT::getInliningLastCallToStaticBonus() *

         getInliningThresholdMultiplier();

}


unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {

  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);


  // Private object passed as arguments may end up in scratch usage if the call

  // is not inlined. Increase the inline threshold to promote inlining.

  unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);

  if (AllocaSize > 0)

    Threshold += ArgAllocaCost;

  return Threshold;

}


unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,

                                         const AllocaInst *AI) const {


  // Below the cutoff, assume that the private memory objects would be

  // optimized

  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);

  if (AllocaSize <= ArgAllocaCutoff)

    return 0;


  // Above the cutoff, we give a cost to each private memory object

  // depending its size. If the array can be optimized by SROA this cost is not

  // added to the total-cost in the inliner cost analysis.

  //

  // We choose the total cost of the alloca such that their sum cancels the

  // bonus given in the threshold (ArgAllocaCost).

  //

  //   Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost

  //

  // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,

  // the single-bb bonus and the vector-bonus.

  //

  // We compensate the first two multipliers, by repeating logic from the

  // inliner-cost in here. The vector-bonus is 0 on AMDGPU.

  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");

  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();


  bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {

    return BB.getTerminator()->getNumSuccessors() > 1;

  });

  if (SingleBB) {

    Threshold += Threshold / 2;

  }


  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());


  // Attribute the bonus proportionally to the alloca size

  unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;


  return AllocaThresholdBonus;

}


void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,

                                         TTI::UnrollingPreferences &UP,

                                         OptimizationRemarkEmitter *ORE) const {

  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);

}


void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,

                                       TTI::PeelingPreferences &PP) const {

  CommonTTI.getPeelingPreferences(L, SE, PP);

}


int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {

  return ST->hasFullRate64Ops()

             ? getFullRateInstrCost()

             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)

                                      : getQuarterRateInstrCost(CostKind);

}


std::pair<InstructionCost, MVT>

GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {

  std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);

  auto Size = DL.getTypeSizeInBits(Ty);

  // Maximum load or store can handle 8 dwords for scalar and 4 for

  // vector ALU. Let's assume anything above 8 dwords is expensive

  // even if legal.

  if (Size <= 256)

    return Cost;


  Cost.first += (Size + 255) / 256;

  return Cost;

}


unsigned GCNTTIImpl::getPrefetchDistance() const {

  return ST->hasPrefetch() ? 128 : 0;

}


bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {

  return AMDGPU::isFlatGlobalAddrSpace(AS);

}


void GCNTTIImpl::collectKernelLaunchBounds(

    const Function &F,

    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {

  SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);

  LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});

  LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});

  LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});

  std::pair<unsigned, unsigned> FlatWorkGroupSize =

      ST->getFlatWorkGroupSizes(F);

  LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});

  LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});

  std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);

  LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});

  LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});

}


GCNTTIImpl::KnownIEEEMode


GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {

  if (!ST->hasIEEEMode()) // Only mode on gfx12

    return KnownIEEEMode::On;


  const Function *F = I.getFunction();

  if (!F)

    return KnownIEEEMode::Unknown;


  Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");

  if (IEEEAttr.isValid())

    return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;


  return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off

                                               : KnownIEEEMode::On;

}


InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,

                                            Align Alignment,

                                            unsigned AddressSpace,

                                            TTI::TargetCostKind CostKind,

                                            TTI::OperandValueInfo OpInfo,

                                            const Instruction *I) const {

  if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {

    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&

        VecTy->getElementType()->isIntegerTy(8)) {

      return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,

                        getLoadStoreVecRegBitWidth(AddressSpace));

    }

  }

  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,

                                OpInfo, I);

}


unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {

  if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {

    if (VecTy->getElementType()->isIntegerTy(8)) {

      unsigned ElementCount = VecTy->getElementCount().getFixedValue();

      return divideCeil(ElementCount - 1, 4);

    }

  }

  return BaseT::getNumberOfParts(Tp);

}


SDValue
return SDValue()

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

const
aarch64 promote const
Definition AArch64PromoteConstant.cpp:228

AMDGPUMCTargetDesc.h
Provides AMDGPU specific target descriptions.

PHI
Rewrite undef for PHI
Definition AMDGPURewriteUndefForPHI.cpp:98

AMDGPUTargetMachine.h
The AMDGPU TargetMachine interface definition for hw codegen targets.

UnrollThresholdIf
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)

ArgAllocaCost
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))

dependsOnLocalPhi
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
Definition AMDGPUTargetTransformInfo.cpp:86

UnrollRuntimeLocal
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)

adjustInliningThresholdUsingCallee
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
Definition AMDGPUTargetTransformInfo.cpp:1344

ArgAllocaCutoff
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))

InlineMaxBB
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))

intrinsicHasPackedVectorBenefit
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
Definition AMDGPUTargetTransformInfo.cpp:688

UnrollMaxBlockToAnalyze
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)

getCallArgsTotalAllocaSize
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
Definition AMDGPUTargetTransformInfo.cpp:1390

UnrollThresholdPrivate
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)

MemcpyLoopUnroll
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering memcpy as a loop"), cl::init(16), cl::Hidden)

UnrollThresholdLocal
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)

AMDGPUTargetTransformInfo.h
This file a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

CostKind
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))

GEP
Hexagon Common GEP
Definition HexagonCommonGEP.cpp:164

IRBuilder.h

Function.h

InlineCost.h

InlinePriorityMode::Size
@ Size
Definition InlineOrder.cpp:25

NumOps
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Definition ItaniumDemangle.h:3450

Ops
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Definition ItaniumDemangle.h:3368

KnownBits.h

RegName
#define RegName(no)

Options
static LVOptions Options
Definition LVOptions.cpp:25

LoopInfo.h

F
#define F(x, y, z)
Definition MD5.cpp:55

I
#define I(x, y, z)
Definition MD5.cpp:58

TRI
Register const TargetRegisterInfo * TRI
Definition MachineSink.cpp:2118

II
uint64_t IntrinsicInst * II
Definition NVVMIntrRange.cpp:46

PatternMatch.h

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition RISCVRedundantCopyElimination.cpp:71

Mode
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))

SIModeRegisterDefaults.h

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition Debug.h:119

Ptr
@ Ptr
Definition TargetLibraryInfo.cpp:77

ValueTracking.h

PointerType
Definition ItaniumDemangle.h:639

llvm::AMDGPUSubtarget::VOLCANIC_ISLANDS
@ VOLCANIC_ISLANDS
Definition AMDGPUSubtarget.h:40

llvm::AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
Definition AMDGPUTargetTransformInfo.cpp:281

llvm::AMDGPUTTIImpl::AMDGPUTTIImpl
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition AMDGPUTargetTransformInfo.cpp:105

llvm::AMDGPUTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition AMDGPUTargetTransformInfo.cpp:276

llvm::AMDGPUTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Definition AMDGPUTargetTransformInfo.cpp:111

llvm::AMDGPUTargetMachine
Definition AMDGPUTargetMachine.h:30

llvm::AllocaInst
an instruction to allocate memory on the stack
Definition Instructions.h:64

llvm::AllocaInst::isStaticAlloca
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition Instructions.cpp:1299

llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition Instructions.h:121

llvm::Argument
This class represents an incoming formal argument to a Function.
Definition Argument.h:32

llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41

llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147

llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142

llvm::Attribute
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:69

llvm::Attribute::getValueAsBool
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition Attributes.cpp:386

llvm::Attribute::isValid
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:223

llvm::BasicBlock
LLVM Basic Block Representation.
Definition BasicBlock.h:62

llvm::BasicTTIImplBase< GCNTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
Definition BasicTTIImpl.h:1423

llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
Definition BasicTTIImpl.h:1033

llvm::BasicTTIImplBase< GCNTTIImpl >::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Definition BasicTTIImpl.h:3186

llvm::BasicTTIImplBase< GCNTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
Definition BasicTTIImpl.h:1366

llvm::BasicTTIImplBase< GCNTTIImpl >::getNumberOfParts
unsigned getNumberOfParts(Type *Tp) const override
Definition BasicTTIImpl.h:3036

llvm::BasicTTIImplBase< GCNTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
Definition BasicTTIImpl.h:1111

llvm::BasicTTIImplBase< GCNTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
Definition BasicTTIImpl.h:1167

llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
Definition BasicTTIImpl.h:3174

llvm::BasicTTIImplBase< AMDGPUTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition BasicTTIImpl.h:774

llvm::BasicTTIImplBase< GCNTTIImpl >::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Definition BasicTTIImpl.h:997

llvm::BasicTTIImplBase< GCNTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Definition BasicTTIImpl.h:1734

llvm::BasicTTIImplBase< AMDGPUTTIImpl >::DL
const DataLayout & DL

llvm::BasicTTIImplBase< GCNTTIImpl >::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Definition BasicTTIImpl.h:1503

llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition Instructions.h:3057

llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition InstrTypes.h:1116

llvm::CallBase::isInlineAsm
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition InstrTypes.h:1415

llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition InstrTypes.h:1348

llvm::CallBase::getCallingConv
CallingConv::ID getCallingConv() const
Definition InstrTypes.h:1406

llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition InstrTypes.h:1292

llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition InstrTypes.h:1283

llvm::CallBase::getArgOperandNo
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
Definition InstrTypes.h:1323

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition Instructions.h:1510

llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition Constants.h:87

llvm::ConstantInt::getTrue
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
Definition Constants.cpp:868

llvm::ConstantInt::getFalse
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
Definition Constants.cpp:875

llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:169

llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63

llvm::ElementCount
Definition TypeSize.h:298

llvm::ElementCount::isScalar
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320

llvm::ExtractValueInst
This instruction extracts a struct member or array element value from an aggregate value.
Definition Instructions.h:2435

llvm::ExtractValueInst::getIndices
ArrayRef< unsigned > getIndices() const
Definition Instructions.h:2487

llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22

llvm::FeatureBitset
Container class for subtarget features.
Definition SubtargetFeature.h:42

llvm::FixedVectorType::get
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803

llvm::Function
Definition Function.h:64

llvm::GCNSubtarget
Definition GCNSubtarget.h:34

llvm::GCNSubtarget::hasFullRate64Ops
bool hasFullRate64Ops() const
Definition GCNSubtarget.h:420

llvm::GCNTTIImpl
Definition AMDGPUTargetTransformInfo.h:63

llvm::GCNTTIImpl::GCNTTIImpl
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition AMDGPUTargetTransformInfo.cpp:305

llvm::GCNTTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
Definition AMDGPUTargetTransformInfo.cpp:380

llvm::GCNTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
Definition AMDGPUTargetTransformInfo.cpp:1222

llvm::GCNTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
Definition AMDGPUTargetTransformInfo.cpp:1547

llvm::GCNTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
Definition AMDGPUTargetTransformInfo.cpp:530

llvm::GCNTTIImpl::collectKernelLaunchBounds
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
Definition AMDGPUTargetTransformInfo.cpp:1514

llvm::GCNTTIImpl::KnownIEEEMode
KnownIEEEMode
Definition AMDGPUTargetTransformInfo.h:285

llvm::GCNTTIImpl::KnownIEEEMode::On
@ On
Definition AMDGPUTargetTransformInfo.h:285

llvm::GCNTTIImpl::KnownIEEEMode::Unknown
@ Unknown
Definition AMDGPUTargetTransformInfo.h:285

llvm::GCNTTIImpl::KnownIEEEMode::Off
@ Off
Definition AMDGPUTargetTransformInfo.h:285

llvm::GCNTTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
Definition AMDGPUTargetTransformInfo.cpp:416

llvm::GCNTTIImpl::isInlineAsmSourceOfDivergence
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
Definition AMDGPUTargetTransformInfo.cpp:914

llvm::GCNTTIImpl::isReadRegisterSourceOfDivergence
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
Definition AMDGPUTargetTransformInfo.cpp:950

llvm::GCNTTIImpl::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
Definition AMDGPUTargetTransformInfo.cpp:348

llvm::GCNTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned RCID) const override
Definition AMDGPUTargetTransformInfo.cpp:320

llvm::GCNTTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
Definition AMDGPUTargetTransformInfo.cpp:410

llvm::GCNTTIImpl::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
Definition AMDGPUTargetTransformInfo.cpp:370

llvm::GCNTTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition AMDGPUTargetTransformInfo.cpp:397

llvm::GCNTTIImpl::shouldPrefetchAddressSpace
bool shouldPrefetchAddressSpace(unsigned AS) const override
Definition AMDGPUTargetTransformInfo.cpp:1510

llvm::GCNTTIImpl::hasBranchDivergence
bool hasBranchDivergence(const Function *F=nullptr) const override
Definition AMDGPUTargetTransformInfo.cpp:316

llvm::GCNTTIImpl::rewriteIntrinsicWithAddressSpace
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
Definition AMDGPUTargetTransformInfo.cpp:1134

llvm::GCNTTIImpl::getCallerAllocaCost
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
Definition AMDGPUTargetTransformInfo.cpp:1433

llvm::GCNTTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(ElementCount VF) const override
Definition AMDGPUTargetTransformInfo.cpp:495

llvm::GCNTTIImpl::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
Definition AMDGPUTargetTransformInfo.cpp:453

llvm::GCNTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
Definition AMDGPUTargetTransformInfo.cpp:850

llvm::GCNTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
Definition AMDGPUTargetTransformInfo.cpp:710

llvm::GCNTTIImpl::getInliningThresholdMultiplier
unsigned getInliningThresholdMultiplier() const override
Definition AMDGPUTargetTransformInfo.h:251

llvm::GCNTTIImpl::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
Definition AMDGPUTargetTransformInfo.cpp:359

llvm::GCNTTIImpl::getPrefetchDistance
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
Definition AMDGPUTargetTransformInfo.cpp:1506

llvm::GCNTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
Definition AMDGPUTargetTransformInfo.cpp:819

llvm::GCNTTIImpl::fpenvIEEEMode
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
Definition AMDGPUTargetTransformInfo.cpp:1531

llvm::GCNTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const override
Definition AMDGPUTargetTransformInfo.cpp:1422

llvm::GCNTTIImpl::isProfitableToSinkOperands
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
Definition AMDGPUTargetTransformInfo.cpp:1289

llvm::GCNTTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
Definition AMDGPUTargetTransformInfo.cpp:504

llvm::GCNTTIImpl::isAlwaysUniform
bool isAlwaysUniform(const Value *V) const override
Definition AMDGPUTargetTransformInfo.cpp:1044

llvm::GCNTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
Definition AMDGPUTargetTransformInfo.cpp:1305

llvm::GCNTTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
Definition AMDGPUTargetTransformInfo.cpp:868

llvm::GCNTTIImpl::isSourceOfDivergence
bool isSourceOfDivergence(const Value *V) const override
Definition AMDGPUTargetTransformInfo.cpp:973

llvm::GCNTTIImpl::getInliningLastCallToStaticBonus
int getInliningLastCallToStaticBonus() const override
Definition AMDGPUTargetTransformInfo.cpp:1417

llvm::GCNTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
Definition AMDGPUTargetTransformInfo.cpp:882

llvm::GCNTTIImpl::collectFlatAddressOperands
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
Definition AMDGPUTargetTransformInfo.cpp:1118

llvm::GCNTTIImpl::getNumberOfParts
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
Definition AMDGPUTargetTransformInfo.cpp:1564

llvm::GCNTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition AMDGPUTargetTransformInfo.cpp:1480

llvm::GCNTTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const override
Definition AMDGPUTargetTransformInfo.cpp:344

llvm::GCNTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
Definition AMDGPUTargetTransformInfo.cpp:332

llvm::GCNTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Definition AMDGPUTargetTransformInfo.cpp:1474

llvm::GCNTTIImpl::getMemcpyLoopLoweringType
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
Definition AMDGPUTargetTransformInfo.cpp:426

llvm::GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
Definition AMDGPUTargetTransformInfo.cpp:422

llvm::GCNTargetMachine
Definition AMDGPUTargetMachine.h:81

llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition Instructions.h:949

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780

llvm::InlineAsm::isOutput
@ isOutput
Definition InlineAsm.h:99

llvm::InstructionCost
Definition InstructionCost.h:30

llvm::InstructionCost::getValue
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
Definition InstructionCost.h:88

llvm::Instruction
Definition Instruction.h:69

llvm::Instruction::hasApproxFunc
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
Definition Instruction.cpp:673

llvm::Instruction::hasAllowContract
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
Definition Instruction.cpp:668

llvm::Instruction::getDataLayout
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition Instruction.cpp:86

llvm::IntrinsicCostAttributes
Definition TargetTransformInfo.h:122

llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition TargetTransformInfo.h:157

llvm::IntrinsicCostAttributes::getInst
const IntrinsicInst * getInst() const
Definition TargetTransformInfo.h:156

llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition TargetTransformInfo.h:155

llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition IntrinsicInst.h:49

llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition IntrinsicInst.h:56

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68

llvm::LoadInst
An instruction for reading from memory.
Definition Instructions.h:180

llvm::Loop
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40

llvm::MDNode
Metadata node.
Definition Metadata.h:1077

llvm::MVT
Machine Value Type.
Definition MachineValueType.h:36

llvm::MVT::SimpleValueType
SimpleValueType
Definition MachineValueType.h:38

llvm::MVT::getVT
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition ValueTypes.cpp:247

llvm::Metadata
Root of the metadata hierarchy.
Definition Metadata.h:63

llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67

llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition OptimizationRemarkEmitter.h:33

llvm::PHINode
Definition Instructions.h:2638

llvm::SIRegisterInfo
Definition SIRegisterInfo.h:40

llvm::SITargetLowering
Definition SIISelLowering.h:31

llvm::SITargetLowering::getNumRegistersForCallingConv
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
Definition SIISelLowering.cpp:1123

llvm::ScalarEvolution
The main scalar evolution driver.
Definition ScalarEvolution.h:448

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition SmallPtrSet.h:401

llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition SmallPtrSet.h:541

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition SmallVector.h:574

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:414

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1197

llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55

llvm::TargetLowering::AsmOperandInfoVector
std::vector< AsmOperandInfo > AsmOperandInfoVector
Definition TargetLowering.h:5193

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition TargetMachine.h:83

llvm::TargetOptions
Definition TargetOptions.h:118

llvm::TargetRegisterClass
Definition TargetRegisterInfo.h:45

llvm::TargetTransformInfoImplBase::getInliningLastCallToStaticBonus
virtual int getInliningLastCallToStaticBonus() const
Definition TargetTransformInfoImpl.h:97

llvm::TargetTransformInfoImplBase::getDataLayout
virtual const DataLayout & getDataLayout() const
Definition TargetTransformInfoImpl.h:50

llvm::TargetTransformInfoImplBase::getMemcpyLoopResidualLoweringType
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
Definition TargetTransformInfoImpl.h:1013

llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition TargetTransformInfo.h:271

llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition TargetTransformInfo.h:274

llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition TargetTransformInfo.h:275

llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition TargetTransformInfo.h:1617

llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition TargetTransformInfo.h:1208

llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition TargetTransformInfo.h:1208

llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition TargetTransformInfo.h:1208

llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition TargetTransformInfo.h:1208

llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition TargetTransformInfo.h:297

llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition TargetTransformInfo.h:1125

llvm::TargetTransformInfo::SK_InsertSubvector
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
Definition TargetTransformInfo.h:1132

llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition TargetTransformInfo.h:1128

llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition TargetTransformInfo.h:1136

llvm::TargetTransformInfo::SK_Splice
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
Definition TargetTransformInfo.h:1138

llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition TargetTransformInfo.h:1126

llvm::TargetTransformInfo::SK_PermuteTwoSrc
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
Definition TargetTransformInfo.h:1134

llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition TargetTransformInfo.h:1127

llvm::TargetTransformInfo::SK_ExtractSubvector
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
Definition TargetTransformInfo.h:1133

llvm::TypeSize
Definition TypeSize.h:332

llvm::TypeSize::getFixed
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343

llvm::TypeSize::getScalable
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45

llvm::Type::getInt64Ty
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298

llvm::Type::getInt32Ty
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297

llvm::Type::getPointerAddressSpace
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition DerivedTypes.h:771

llvm::Type::getInt8Ty
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295

llvm::Type::getInt16Ty
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:296

llvm::Type::isSized
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition Type.h:311

llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128

llvm::Type::getScalarSizeInBits
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231

llvm::Type::getIntNTy
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301

llvm::Use
A Use represents the edge between a Value definition and its users.
Definition Use.h:35

llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition User.h:232

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256

llvm::Value::user_begin
user_iterator user_begin()
Definition Value.h:402

llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439

llvm::Value::getContext
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101

llvm::VectorType
Base class of all SIMD vector types.
Definition DerivedTypes.h:430

llvm::cl::opt
Definition CommandLine.h:1429

llvm::details::FixedOrScalableQuantity::getFixedValue
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200

uint64_t

Analysis.h

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition AMDGPUAddrSpace.h:38

llvm::AMDGPUAS::BUFFER_STRIDED_POINTER
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
Definition AMDGPUAddrSpace.h:45

llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition AMDGPUAddrSpace.h:32

llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition AMDGPUAddrSpace.h:34

llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition AMDGPUAddrSpace.h:35

llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition AMDGPUAddrSpace.h:30

llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition AMDGPUAddrSpace.h:31

llvm::AMDGPUAS::BUFFER_FAT_POINTER
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition AMDGPUAddrSpace.h:40

llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition AMDGPUAddrSpace.h:36

llvm::AMDGPUAS::BUFFER_RESOURCE
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
Definition AMDGPUAddrSpace.h:43

llvm::AMDGPU
Definition AMDGPUMetadataVerifier.h:34

llvm::AMDGPU::isShader
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
Definition AMDGPUBaseInfo.h:1425

llvm::AMDGPU::isFlatGlobalAddrSpace
bool isFlatGlobalAddrSpace(unsigned AS)
Definition AMDGPUAddrSpace.h:86

llvm::AMDGPU::isArgPassedInSGPR
bool isArgPassedInSGPR(const Argument *A)
Definition AMDGPUBaseInfo.cpp:3157

llvm::AMDGPU::isIntrinsicAlwaysUniform
bool isIntrinsicAlwaysUniform(unsigned IntrID)
Definition AMDGPUBaseInfo.cpp:3323

llvm::AMDGPU::isIntrinsicSourceOfDivergence
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
Definition AMDGPUBaseInfo.cpp:3319

llvm::AMDGPU::isExtendedGlobalAddrSpace
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition AMDGPUAddrSpace.h:91

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34

llvm::FPOpFusion::Fast
@ Fast
Definition TargetOptions.h:31

llvm::ISD
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24

llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259

llvm::ISD::FSUB
@ FSUB
Definition ISDOpcodes.h:411

llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410

llvm::ISD::SRL
@ SRL
Definition ISDOpcodes.h:758

llvm::ISD::SRA
@ SRA
Definition ISDOpcodes.h:757

llvm::ISD::OR
@ OR
Definition ISDOpcodes.h:731

llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756

llvm::ISD::XOR
@ XOR
Definition ISDOpcodes.h:732

llvm::ISD::FMUL
@ FMUL
Definition ISDOpcodes.h:412

llvm::ISD::SUB
@ SUB
Definition ISDOpcodes.h:260

llvm::ISD::FDIV
@ FDIV
Definition ISDOpcodes.h:413

llvm::ISD::FREM
@ FREM
Definition ISDOpcodes.h:414

llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730

llvm::ISD::MUL
@ MUL
Definition ISDOpcodes.h:261

llvm::InlineConstants::getInstrCost
LLVM_ABI int getInstrCost()
Definition InlineCost.cpp:206

llvm::Intrinsic
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
Definition GenericSSAContext.h:27

llvm::Intrinsic::getOrInsertDeclaration
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition Intrinsics.cpp:751

llvm::Intrinsic::ID
unsigned ID
Definition GenericSSAContext.h:28

llvm::NVPTXAS::AddressSpace
AddressSpace
Definition NVPTXAddrSpace.h:21

llvm::PatternMatch
Definition PatternMatch.h:47

llvm::PatternMatch::m_AShr
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
Definition PatternMatch.h:1326

llvm::PatternMatch::m_c_And
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
Definition PatternMatch.h:2966

llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition PatternMatch.h:49

llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition PatternMatch.h:168

llvm::PatternMatch::m_Intrinsic
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
Definition PatternMatch.h:2772

llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition PatternMatch.h:1008

llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition PatternMatch.h:92

llvm::PatternMatch::m_LShr
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
Definition PatternMatch.h:1320

llvm::PatternMatch::m_FNeg
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
Definition PatternMatch.h:1236

llvm::PatternMatch::m_FAbs
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
Definition PatternMatch.h:2846

llvm::SI
Definition SIInstrInfo.h:1718

llvm::cl::Hidden
@ Hidden
Definition CommandLine.h:138

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:444

llvm::mdconst::extract_or_null
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
Definition Metadata.h:681

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition AddressRanges.h:18

llvm::Length
@ Length
Definition DWP.cpp:477

llvm::Cost
InstructionCost Cost
Definition FunctionSpecialization.h:103

llvm::Depth
@ Depth
Definition SIMachineScheduler.h:36

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649

llvm::findOptionMDForLoop
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition LoopInfo.cpp:1079

llvm::dyn_cast_or_null
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759

llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734

llvm::computeKnownBits
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition ValueTracking.cpp:148

llvm::dbgs
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207

llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548

llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition AtomicOrdering.h:56

llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
Definition AtomicOrdering.h:64

llvm::divideCeil
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:399

llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
Definition IVDescriptors.h:48

llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:22

llvm::ComputeValueVTs
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119

llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1963

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565

llvm::getUnderlyingObject
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
Definition ValueTracking.cpp:6658

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39

llvm::DenormalMode::getPreserveSign
static constexpr DenormalMode getPreserveSign()
Definition FloatingPointMode.h:119

llvm::EVT
Extended Value Type.
Definition ValueTypes.h:35

llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:380

llvm::KnownBits
Definition KnownBits.h:24

llvm::KnownBits::countMinLeadingOnes
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition KnownBits.h:244

llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition TargetTransformInfo.h:73

llvm::SIModeRegisterDefaults
Definition SIModeRegisterDefaults.h:20

llvm::SIModeRegisterDefaults::isInlineCompatible
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Definition SIModeRegisterDefaults.h:85

llvm::TargetTransformInfo::OperandValueInfo
Definition TargetTransformInfo.h:1162

llvm::TargetTransformInfo::PeelingPreferences
Definition TargetTransformInfo.h:677

llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition TargetTransformInfo.h:547

llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition TargetTransformInfo.h:588

llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition TargetTransformInfo.h:555

llvm::TargetTransformInfo::UnrollingPreferences::UnrollVectorizedLoop
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Definition TargetTransformInfo.h:632

llvm::TargetTransformInfo::UnrollingPreferences::MaxIterationsCountToAnalyze
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
Definition TargetTransformInfo.h:630

llvm::TargetTransformInfo::UnrollingPreferences::BEInsns
unsigned BEInsns
Definition TargetTransformInfo.h:601

llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition TargetTransformInfo.h:572

llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition TargetTransformInfo.h:608

llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition TargetTransformInfo.h:604

llvm::cl::desc
Definition CommandLine.h:410