//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
    return true;
  }
}

static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
    return true;
  }
}

bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) const {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
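    // For example, a load from the shared or global address space is not, by
    // itself, treated as a source of divergence here, whereas the same load
    // through a generic pointer is conservatively treated as divergent.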
    // Atomic instructions may cause divergence. Atomic instructions are
    // executed sequentially across all threads in a warp. Therefore, an earlier
    // executed thread may see different memory inputs than a later executed
    // thread. For example, suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1 for
    // the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}

// Convert NVVM intrinsics to target-generic LLVM code where possible.
static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC,
                                               IntrinsicInst *II) {
  // Each NVVM intrinsic we can simplify can be replaced with one of:
  //
  //  * an LLVM intrinsic,
  //  * an LLVM cast operation,
  //  * an LLVM binary operation, or
  //  * ad-hoc LLVM IR for the particular operation.
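  //
  // For example, @llvm.nvvm.ceil.d maps to the target-generic @llvm.ceil.f64
  // intrinsic, and @llvm.nvvm.div.rn.d maps to a plain fdiv instruction.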

  // Some transformations are only valid when the module's
  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
  // transformations are valid regardless of the module's ftz setting.
  enum FtzRequirementTy {
    FTZ_Any,       // Any ftz setting is ok.
    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
  };
  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
  // target-generic intrinsic, cast op, or binary op but that we can nonetheless
  // simplify.
  enum SpecialCase {
    SPC_Reciprocal,
    SCP_FunnelShiftClamp,
  };

  // SimplifyAction is a poor-man's variant (plus an additional flag) that
  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
  struct SimplifyAction {
    // Invariant: At most one of these Optionals has a value.
    std::optional<Intrinsic::ID> IID;
    std::optional<Instruction::CastOps> CastOp;
    std::optional<Instruction::BinaryOps> BinaryOp;
    std::optional<SpecialCase> Special;

    FtzRequirementTy FtzRequirement = FTZ_Any;
    // Denormal handling is guarded by different attributes depending on the
    // type (denormal-fp-math vs denormal-fp-math-f32), take note of halfs.
    bool IsHalfTy = false;

    SimplifyAction() = default;

    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq,
                   bool IsHalfTy = false)
        : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {}

    // Cast operations don't have anything to do with FTZ, so we skip that
    // argument.
    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}

    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
        : Special(Special), FtzRequirement(FtzReq) {}
  };

  // Try to generate a SimplifyAction describing how to replace our
  // IntrinsicInstr with target-generic LLVM IR.
  const SimplifyAction Action = [II]() -> SimplifyAction {
    switch (II->getIntrinsicID()) {
    // NVVM intrinsics that map directly to LLVM intrinsics.
    case Intrinsic::nvvm_ceil_d:
      return {Intrinsic::ceil, FTZ_Any};
    case Intrinsic::nvvm_ceil_f:
      return {Intrinsic::ceil, FTZ_MustBeOff};
    case Intrinsic::nvvm_ceil_ftz_f:
      return {Intrinsic::ceil, FTZ_MustBeOn};
    case Intrinsic::nvvm_floor_d:
      return {Intrinsic::floor, FTZ_Any};
    case Intrinsic::nvvm_floor_f:
      return {Intrinsic::floor, FTZ_MustBeOff};
    case Intrinsic::nvvm_floor_ftz_f:
      return {Intrinsic::floor, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_d:
      return {Intrinsic::fma, FTZ_Any};
    case Intrinsic::nvvm_fma_rn_f:
      return {Intrinsic::fma, FTZ_MustBeOff};
    case Intrinsic::nvvm_fma_rn_ftz_f:
      return {Intrinsic::fma, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_f16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_bf16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_bf16:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_bf16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_bf16x2:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_d:
      return {Intrinsic::maxnum, FTZ_Any};
    case Intrinsic::nvvm_fmax_f:
      return {Intrinsic::maxnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_f:
      return {Intrinsic::maxnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_d:
      return {Intrinsic::minnum, FTZ_Any};
    case Intrinsic::nvvm_fmin_f:
      return {Intrinsic::minnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_f:
      return {Intrinsic::minnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_f16:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_sqrt_rn_d:
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_f:
      // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
      // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
      // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
      // the versions with explicit ftz-ness.
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_trunc_d:
      return {Intrinsic::trunc, FTZ_Any};
    case Intrinsic::nvvm_trunc_f:
      return {Intrinsic::trunc, FTZ_MustBeOff};
    case Intrinsic::nvvm_trunc_ftz_f:
      return {Intrinsic::trunc, FTZ_MustBeOn};

    // NVVM intrinsics that map to LLVM cast operations.
    //
    // Note that llvm's target-generic conversion operators correspond to the rz
    // (round to zero) versions of the nvvm conversion intrinsics, even though
    // most everything else here uses the rn (round to nearest even) nvvm ops.
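    // For example, @llvm.nvvm.d2i.rz(double %x) is equivalent to the cast
    // 'fptosi double %x to i32'.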
    case Intrinsic::nvvm_d2i_rz:
    case Intrinsic::nvvm_f2i_rz:
    case Intrinsic::nvvm_d2ll_rz:
    case Intrinsic::nvvm_f2ll_rz:
      return {Instruction::FPToSI};
    case Intrinsic::nvvm_d2ui_rz:
    case Intrinsic::nvvm_f2ui_rz:
    case Intrinsic::nvvm_d2ull_rz:
    case Intrinsic::nvvm_f2ull_rz:
      return {Instruction::FPToUI};
    // Integer to floating-point uses RN rounding, not RZ.
    case Intrinsic::nvvm_i2d_rn:
    case Intrinsic::nvvm_i2f_rn:
    case Intrinsic::nvvm_ll2d_rn:
    case Intrinsic::nvvm_ll2f_rn:
      return {Instruction::SIToFP};
    case Intrinsic::nvvm_ui2d_rn:
    case Intrinsic::nvvm_ui2f_rn:
    case Intrinsic::nvvm_ull2d_rn:
    case Intrinsic::nvvm_ull2f_rn:
      return {Instruction::UIToFP};

    // NVVM intrinsics that map to LLVM binary ops.
    case Intrinsic::nvvm_div_rn_d:
      return {Instruction::FDiv, FTZ_Any};

    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
    // need special handling.
    //
    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
    // as well.
    case Intrinsic::nvvm_rcp_rn_d:
      return {SPC_Reciprocal, FTZ_Any};

    case Intrinsic::nvvm_fshl_clamp:
    case Intrinsic::nvvm_fshr_clamp:
      return {SCP_FunnelShiftClamp, FTZ_Any};

    // We do not currently simplify intrinsics that give an approximate
    // answer. These include:
    //
    //   - nvvm_cos_approx_{f,ftz_f}
    //   - nvvm_ex2_approx_{d,f,ftz_f}
    //   - nvvm_lg2_approx_{d,f,ftz_f}
    //   - nvvm_sin_approx_{f,ftz_f}
    //   - nvvm_sqrt_approx_{f,ftz_f}
    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
    //   - nvvm_rcp_approx_ftz_d
    //
    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
    // means that fastmath is enabled in the intrinsic. Unfortunately only
    // binary operators (currently) have a fastmath bit in SelectionDAG, so
    // this information gets lost and we can't select on it.
    //
    // TODO: div and rcp are lowered to a binary op, so we could in theory
    // lower them to "fast fdiv".

    default:
      return {};
    }
  }();

  // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
  // can bail out now. (Notice that in the case that IID is not an NVVM
  // intrinsic, we don't have to look up any module metadata, as
  // FtzRequirementTy will be FTZ_Any.)
  if (Action.FtzRequirement != FTZ_Any) {
    // FIXME: Broken for f64
    DenormalMode Mode = II->getFunction()->getDenormalMode(
        Action.IsHalfTy ? APFloat::IEEEhalf() : APFloat::IEEEsingle());
    bool FtzEnabled = Mode.Output == DenormalMode::PreserveSign;

    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
      return nullptr;
  }
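  // For example, with "denormal-fp-math-f32"="preserve-sign,preserve-sign"
  // (ftz on for f32), FTZ_MustBeOn entries such as nvvm_floor_ftz_f survive
  // this check while FTZ_MustBeOff entries such as nvvm_floor_f bail out here,
  // and vice versa with IEEE denormal handling.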

  // Simplify to target-generic intrinsic.
  if (Action.IID) {
    SmallVector<Value *, 4> Args(II->args());
    // All the target-generic intrinsics currently of interest to us have one
    // type argument, equal to that of the nvvm intrinsic's argument.
    Type *Tys[] = {II->getArgOperand(0)->getType()};
    return CallInst::Create(
        Intrinsic::getOrInsertDeclaration(II->getModule(), *Action.IID, Tys),
        Args);
  }

  // Simplify to target-generic binary op.
  if (Action.BinaryOp)
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
                                  II->getArgOperand(1), II->getName());

  // Simplify to target-generic cast op.
  if (Action.CastOp)
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
                            II->getName());

  // All that's left are the special cases.
  if (!Action.Special)
    return nullptr;

  switch (*Action.Special) {
  case SPC_Reciprocal:
    // Simplify reciprocal.
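    // For example, @llvm.nvvm.rcp.rn.d(double %x) becomes 'fdiv double 1.0, %x'.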
    return BinaryOperator::Create(
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
        II->getArgOperand(0), II->getName());

  case SCP_FunnelShiftClamp: {
    // Canonicalize a clamping funnel shift to the generic llvm funnel shift
    // when possible, as this is easier for llvm to optimize further.
    if (const auto *ShiftConst = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
      const bool IsLeft = II->getIntrinsicID() == Intrinsic::nvvm_fshl_clamp;
      if (ShiftConst->getZExtValue() >= II->getType()->getIntegerBitWidth())
        return IC.replaceInstUsesWith(*II, II->getArgOperand(IsLeft ? 1 : 0));

      const unsigned FshIID = IsLeft ? Intrinsic::fshl : Intrinsic::fshr;
      return CallInst::Create(Intrinsic::getOrInsertDeclaration(
                                  II->getModule(), FshIID, II->getType()),
                              SmallVector<Value *, 3>(II->args()));
    }
    return nullptr;
  }
  }
  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
}

// Returns true/false when we know the answer, nullopt otherwise.
static std::optional<bool> evaluateIsSpace(Intrinsic::ID IID, unsigned AS) {
  if (AS == NVPTXAS::ADDRESS_SPACE_GENERIC ||
      AS == NVPTXAS::ADDRESS_SPACE_PARAM)
    return std::nullopt; // Got to check at run-time.
  switch (IID) {
  case Intrinsic::nvvm_isspacep_global:
    return AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
  case Intrinsic::nvvm_isspacep_local:
    return AS == NVPTXAS::ADDRESS_SPACE_LOCAL;
  case Intrinsic::nvvm_isspacep_shared:
    // If shared cluster this can't be evaluated at compile time.
    if (AS == NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER)
      return std::nullopt;
    return AS == NVPTXAS::ADDRESS_SPACE_SHARED;
  case Intrinsic::nvvm_isspacep_shared_cluster:
    return AS == NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER ||
           AS == NVPTXAS::ADDRESS_SPACE_SHARED;
  case Intrinsic::nvvm_isspacep_const:
    return AS == NVPTXAS::ADDRESS_SPACE_CONST;
  default:
    llvm_unreachable("Unexpected intrinsic");
  }
}

// Returns an instruction pointer (may be nullptr if we do not know the answer).
// Returns nullopt if `II` is not one of the `isspacep` intrinsics.
//
// TODO: If InferAddressSpaces were run early enough in the pipeline this could
// be removed in favor of the constant folding that occurs there through
// rewriteIntrinsicWithAddressSpace
static std::optional<Instruction *>
handleSpaceCheckIntrinsics(InstCombiner &IC, IntrinsicInst &II) {
  switch (auto IID = II.getIntrinsicID()) {
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster:
  case Intrinsic::nvvm_isspacep_const: {
    Value *Op0 = II.getArgOperand(0);
    unsigned AS = Op0->getType()->getPointerAddressSpace();
    // Peek through ASC to generic AS.
    // TODO: we could dig deeper through both ASCs and GEPs.
    if (AS == NVPTXAS::ADDRESS_SPACE_GENERIC)
      if (auto *ASCO = dyn_cast<AddrSpaceCastOperator>(Op0))
        AS = ASCO->getOperand(0)->getType()->getPointerAddressSpace();

    if (std::optional<bool> Answer = evaluateIsSpace(IID, AS))
      return IC.replaceInstUsesWith(II,
                                    ConstantInt::get(II.getType(), *Answer));
    return nullptr; // Don't know the answer, got to check at run time.
  }
  default:
    return std::nullopt;
  }
}

std::optional<Instruction *>
NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  if (std::optional<Instruction *> I = handleSpaceCheckIntrinsics(IC, II))
    return *I;
  if (Instruction *I = convertNvvmIntrinsicToLlvm(IC, &II))
    return I;

  return std::nullopt;
}

InstructionCost
NVPTXTTIImpl::getInstructionCost(const User *U,
                                 ArrayRef<const Value *> Operands,
                                 TTI::TargetCostKind CostKind) const {
  if (const auto *CI = dyn_cast<CallInst>(U))
    if (const auto *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
      // Without this implementation getCallCost() would return the number of
      // arguments+1 as the cost, because the cost model classifies inline asm
      // as a call. A better cost model would be to return the number of asm
      // instructions embedded in the asm string.
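      // For example, the asm string "{ .reg .u32 t; mov.u32 t, %1; add.u32 %0, t, t; }"
      // is counted as two instructions below: the ".reg" declaration and the
      // closing brace are skipped, while the mov and add are counted.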
      StringRef AsmStr = IA->getAsmString();
      const unsigned InstCount =
          count_if(split(AsmStr, ';'), [](StringRef AsmInst) {
            // Trim off scopes denoted by '{' and '}' as these can be ignored.
            AsmInst = AsmInst.trim().ltrim("{} \t\n\v\f\r");
            // This is pretty coarse but does a reasonably good job of
            // identifying things that look like instructions, possibly with a
            // predicate ("@").
            return !AsmInst.empty() &&
                   (AsmInst[0] == '@' || isAlpha(AsmInst[0]) ||
                    AsmInst.find(".pragma") != StringRef::npos);
          });
      return InstCount * TargetTransformInfo::TCC_Basic;
    }

  return BaseT::getInstructionCost(U, Operands, CostKind);
}

InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
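    // For example, an ADD on i64 is costed at 2 * LT.first, i.e. twice the
    // cost of the same operation on i32.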
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
  }
}

void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) const {
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold. This partially unrolls small loops which are often
  // unrolled by the PTX to SASS compiler and unrolling earlier can be
  // beneficial.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}

void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                              Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::nvvm_isspacep_const:
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster:
  case Intrinsic::nvvm_prefetch_tensormap: {
    OpIndexes.push_back(0);
    return true;
  }
  }
  return false;
}

Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                      Value *OldV,
                                                      Value *NewV) const {
  const Intrinsic::ID IID = II->getIntrinsicID();
  switch (IID) {
  case Intrinsic::nvvm_isspacep_const:
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster: {
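    // For example, once InferAddressSpaces has shown that the operand is in
    // addrspace(1) (global), nvvm.isspacep.global folds to true here and
    // nvvm.isspacep.shared folds to false.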
    const unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    if (const auto R = evaluateIsSpace(IID, NewAS))
      return ConstantInt::get(II->getType(), *R);
    return nullptr;
  }
  case Intrinsic::nvvm_prefetch_tensormap: {
    IRBuilder<> Builder(II);
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
                                        NewV);
  }
  }
  return nullptr;
}

unsigned NVPTXTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  // 256-bit loads/stores are currently only supported for the global address
  // space.
  if (ST->has256BitVectorLoadStore(AddrSpace))
    return 256;
  return 128;
}

unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
  if (isa<AllocaInst>(V))
    return ADDRESS_SPACE_LOCAL;

  if (const Argument *Arg = dyn_cast<Argument>(V)) {
    if (isKernelFunction(*Arg->getParent())) {
      const NVPTXTargetMachine &TM =
          static_cast<const NVPTXTargetMachine &>(getTLI()->getTargetMachine());
      if (TM.getDrvInterface() == NVPTX::CUDA && !Arg->hasByValAttr())
        return ADDRESS_SPACE_GLOBAL;
    } else {
      // We assume that all device parameters that are passed byval will be
      // placed in the local AS. Very simple cases will be updated after ISel to
      // use the device param space where possible.
      if (Arg->hasByValAttr())
        return ADDRESS_SPACE_LOCAL;
    }
  }

  return -1;
}
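
// For example, for a kernel whose maxclusterrank is 4 and whose maxntid is
// (128, 2, 1), this records {"maxclusterrank", 4}, {"maxntidx", 128},
// {"maxntidy", 2}, and {"maxntidz", 1} in LB.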
void NVPTXTTIImpl::collectKernelLaunchBounds(
    const Function &F,
    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
  if (const auto Val = getMaxClusterRank(F))
    LB.push_back({"maxclusterrank", *Val});

  const auto MaxNTID = getMaxNTID(F);
  if (MaxNTID.size() > 0)
    LB.push_back({"maxntidx", MaxNTID[0]});
  if (MaxNTID.size() > 1)
    LB.push_back({"maxntidy", MaxNTID[1]});
  if (MaxNTID.size() > 2)
    LB.push_back({"maxntidz", MaxNTID[2]});
}