LLVM: lib/Target/AMDGPU/SIInstrInfo.cpp Source File

//===- SIInstrInfo.cpp - SI Instruction Information  ----------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file

/// SI Implementation of TargetInstrInfo.

//

//===----------------------------------------------------------------------===//


#include "SIInstrInfo.h"

#include "AMDGPU.h"

#include "AMDGPUInstrInfo.h"

#include "AMDGPULaneMaskUtils.h"

#include "GCNHazardRecognizer.h"

#include "GCNSubtarget.h"

#include "SIMachineFunctionInfo.h"

#include "Utils/AMDGPUBaseInfo.h"

#include "llvm/ADT/STLExtras.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"

#include "llvm/CodeGen/LiveIntervals.h"

#include "llvm/CodeGen/LiveVariables.h"

#include "llvm/CodeGen/MachineDominators.h"

#include "llvm/CodeGen/MachineFrameInfo.h"

#include "llvm/CodeGen/MachineScheduler.h"

#include "llvm/CodeGen/RegisterScavenging.h"

#include "llvm/CodeGen/ScheduleDAG.h"

#include "llvm/IR/DiagnosticInfo.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/MC/MCContext.h"

#include "llvm/Support/CommandLine.h"

#include "llvm/Target/TargetMachine.h"


using namespace llvm;


#define DEBUG_TYPE "si-instr-info"


#define GET_INSTRINFO_CTOR_DTOR

#include "AMDGPUGenInstrInfo.inc"


namespace llvm::AMDGPU {

#define GET_D16ImageDimIntrinsics_IMPL

#define GET_ImageDimIntrinsicTable_IMPL

#define GET_RsrcIntrinsics_IMPL

#include "AMDGPUGenSearchableTables.inc"

} // namespace llvm::AMDGPU


// Must be at least 4 to be able to branch over minimum unconditional branch

// code. This is only for making it possible to write reasonably small tests for

// long branches.

static cl::opt<unsigned>

BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),

                 cl::desc("Restrict range of branch instructions (DEBUG)"));


static cl::opt<bool> Fix16BitCopies(

  "amdgpu-fix-16-bit-physreg-copies",

  cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),

  cl::init(true),

  cl::ReallyHidden);


SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)

    : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),

      RI(ST), ST(ST) {

  SchedModel.init(&ST);

}


//===----------------------------------------------------------------------===//

// TargetInstrInfo callbacks

//===----------------------------------------------------------------------===//


static unsigned getNumOperandsNoGlue(SDNode *Node) {

  unsigned N = Node->getNumOperands();

  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)

    --N;

  return N;

}


/// Returns true if both nodes have the same value for the given

///        operand \p Op, or if both nodes do not have this operand.


static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,

                                      AMDGPU::OpName OpName) {

  unsigned Opc0 = N0->getMachineOpcode();

  unsigned Opc1 = N1->getMachineOpcode();


  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);

  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);


  if (Op0Idx == -1 && Op1Idx == -1)

    return true;


  if ((Op0Idx == -1 && Op1Idx != -1) ||

      (Op1Idx == -1 && Op0Idx != -1))

    return false;


  // getNamedOperandIdx returns the index for the MachineInstr's operands,

  // which includes the result as the first operand. We are indexing into the

  // MachineSDNode's operands, so we need to skip the result operand to get

  // the real index.

  --Op0Idx;

  --Op1Idx;


  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);

}


static bool canRemat(const MachineInstr &MI) {


  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||

      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||

      SIInstrInfo::isSALU(MI))

    return true;


  if (SIInstrInfo::isSMRD(MI)) {

    return !MI.memoperands_empty() &&

           llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {

             return MMO->isLoad() && MMO->isInvariant();

           });

  }


  return false;

}


bool SIInstrInfo::isReallyTriviallyReMaterializable(

    const MachineInstr &MI) const {


  if (canRemat(MI)) {

    // Normally VALU use of exec would block the rematerialization, but that

    // is OK in this case to have an implicit exec read as all VALU do.

    // We really want all of the generic logic for this except for this.


    // Another potential implicit use is mode register. The core logic of

    // the RA will not attempt rematerialization if mode is set anywhere

    // in the function, otherwise it is safe since mode is not changed.


    // There is difference to generic method which does not allow

    // rematerialization if there are virtual register uses. We allow this,

    // therefore this method includes SOP instructions as well.

    if (!MI.hasImplicitDef() &&

        MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&

        !MI.mayRaiseFPException())

      return true;

  }


  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);

}


// Returns true if the scalar result of a VALU instruction depends on exec.

bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {

  // Ignore comparisons which are only used masked with exec.

  // This allows some hoisting/sinking of VALU comparisons.

  if (MI.isCompare()) {

    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);

    if (!Dst)

      return true;


    Register DstReg = Dst->getReg();

    if (!DstReg.isVirtual())

      return true;


    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

    for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {

      switch (Use.getOpcode()) {

      case AMDGPU::S_AND_SAVEEXEC_B32:

      case AMDGPU::S_AND_SAVEEXEC_B64:

        break;

      case AMDGPU::S_AND_B32:

      case AMDGPU::S_AND_B64:

        if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))

          return true;

        break;

      default:

        return true;

      }

    }

    return false;

  }


  switch (MI.getOpcode()) {

  default:

    break;

  case AMDGPU::V_READFIRSTLANE_B32:

    return true;

  }


  return false;

}


bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {

  // Any implicit use of exec by VALU is not a real register read.

  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&

         isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());

}


bool SIInstrInfo::isSafeToSink(MachineInstr &MI,

                               MachineBasicBlock *SuccToSinkTo,

                               MachineCycleInfo *CI) const {

  // Allow sinking if MI edits lane mask (divergent i1 in sgpr).

  if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)

    return true;


  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

  // Check if sinking of MI would create temporal divergent use.

  for (auto Op : MI.uses()) {

    if (Op.isReg() && Op.getReg().isVirtual() &&

        RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {

      MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());


      // SgprDef defined inside cycle

      MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());

      if (FromCycle == nullptr)

        continue;


      MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);

      // Check if there is a FromCycle that contains SgprDef's basic block but

      // does not contain SuccToSinkTo and also has divergent exit condition.

      while (FromCycle && !FromCycle->contains(ToCycle)) {

        SmallVector<MachineBasicBlock *, 1> ExitingBlocks;

        FromCycle->getExitingBlocks(ExitingBlocks);


        // FromCycle has divergent exit condition.

        for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {

          if (hasDivergentBranch(ExitingBlock))

            return false;

        }


        FromCycle = FromCycle->getParentCycle();

      }

    }

  }


  return true;

}


bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,

                                          int64_t &Offset0,

                                          int64_t &Offset1) const {

  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())

    return false;


  unsigned Opc0 = Load0->getMachineOpcode();

  unsigned Opc1 = Load1->getMachineOpcode();


  // Make sure both are actually loads.

  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())

    return false;


  // A mayLoad instruction without a def is not a load. Likely a prefetch.

  if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())

    return false;


  if (isDS(Opc0) && isDS(Opc1)) {


    // FIXME: Handle this case:

    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))

      return false;


    // Check base reg.

    if (Load0->getOperand(0) != Load1->getOperand(0))

      return false;


    // Skip read2 / write2 variants for simplicity.

    // TODO: We should report true if the used offsets are adjacent (excluded

    // st64 versions).

    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);

    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (Offset0Idx == -1 || Offset1Idx == -1)

      return false;


    // XXX - be careful of dataless loads

    // getNamedOperandIdx returns the index for MachineInstrs.  Since they

    // include the output in the operand list, but SDNodes don't, we need to

    // subtract the index by one.

    Offset0Idx -= get(Opc0).NumDefs;

    Offset1Idx -= get(Opc1).NumDefs;

    Offset0 = Load0->getConstantOperandVal(Offset0Idx);

    Offset1 = Load1->getConstantOperandVal(Offset1Idx);

    return true;

  }


  if (isSMRD(Opc0) && isSMRD(Opc1)) {

    // Skip time and cache invalidation instructions.

    if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||

        !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))

      return false;


    unsigned NumOps = getNumOperandsNoGlue(Load0);

    if (NumOps != getNumOperandsNoGlue(Load1))

      return false;


    // Check base reg.

    if (Load0->getOperand(0) != Load1->getOperand(0))

      return false;


    // Match register offsets, if both register and immediate offsets present.

    assert(NumOps == 4 || NumOps == 5);

    if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))

      return false;


    const ConstantSDNode *Load0Offset =

        dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));

    const ConstantSDNode *Load1Offset =

        dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));


    if (!Load0Offset || !Load1Offset)

      return false;


    Offset0 = Load0Offset->getZExtValue();

    Offset1 = Load1Offset->getZExtValue();

    return true;

  }


  // MUBUF and MTBUF can access the same addresses.

  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {


    // MUBUF and MTBUF have vaddr at different indices.

    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||

        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||

        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))

      return false;


    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);

    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);


    if (OffIdx0 == -1 || OffIdx1 == -1)

      return false;


    // getNamedOperandIdx returns the index for MachineInstrs.  Since they

    // include the output in the operand list, but SDNodes don't, we need to

    // subtract the index by one.

    OffIdx0 -= get(Opc0).NumDefs;

    OffIdx1 -= get(Opc1).NumDefs;


    SDValue Off0 = Load0->getOperand(OffIdx0);

    SDValue Off1 = Load1->getOperand(OffIdx1);


    // The offset might be a FrameIndexSDNode.

    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))

      return false;


    Offset0 = Off0->getAsZExtVal();

    Offset1 = Off1->getAsZExtVal();

    return true;

  }


  return false;

}


static bool isStride64(unsigned Opc) {

  switch (Opc) {

  case AMDGPU::DS_READ2ST64_B32:

  case AMDGPU::DS_READ2ST64_B64:

  case AMDGPU::DS_WRITE2ST64_B32:

  case AMDGPU::DS_WRITE2ST64_B64:

    return true;

  default:

    return false;

  }

}


bool SIInstrInfo::getMemOperandsWithOffsetWidth(

    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,

    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,

    const TargetRegisterInfo *TRI) const {

  if (!LdSt.mayLoadOrStore())

    return false;


  unsigned Opc = LdSt.getOpcode();

  OffsetIsScalable = false;

  const MachineOperand *BaseOp, *OffsetOp;

  int DataOpIdx;


  if (isDS(LdSt)) {

    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);

    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);

    if (OffsetOp) {

      // Normal, single offset LDS instruction.

      if (!BaseOp) {

        // DS_CONSUME/DS_APPEND use M0 for the base address.

        // TODO: find the implicit use operand for M0 and use that as BaseOp?

        return false;

      }

      BaseOps.push_back(BaseOp);

      Offset = OffsetOp->getImm();

      // Get appropriate operand, and compute width accordingly.

      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);

      if (DataOpIdx == -1)

        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);

      if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)

        Width = LocationSize::precise(64);

      else

        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));

    } else {

      // The 2 offset instructions use offset0 and offset1 instead. We can treat

      // these as a load with a single offset if the 2 offsets are consecutive.

      // We will use this for some partially aligned loads.

      const MachineOperand *Offset0Op =

          getNamedOperand(LdSt, AMDGPU::OpName::offset0);

      const MachineOperand *Offset1Op =

          getNamedOperand(LdSt, AMDGPU::OpName::offset1);


      unsigned Offset0 = Offset0Op->getImm() & 0xff;

      unsigned Offset1 = Offset1Op->getImm() & 0xff;

      if (Offset0 + 1 != Offset1)

        return false;


      // Each of these offsets is in element sized units, so we need to convert

      // to bytes of the individual reads.


      unsigned EltSize;

      if (LdSt.mayLoad())

        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;

      else {

        assert(LdSt.mayStore());

        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);

        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;

      }


      if (isStride64(Opc))

        EltSize *= 64;


      BaseOps.push_back(BaseOp);

      Offset = EltSize * Offset0;

      // Get appropriate operand(s), and compute width accordingly.

      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);

      if (DataOpIdx == -1) {

        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);

        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));

        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);

        Width = LocationSize::precise(

            Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));

      } else {

        Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));

      }

    }

    return true;

  }


  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {

    const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);

    if (!RSrc) // e.g. BUFFER_WBINVL1_VOL

      return false;

    BaseOps.push_back(RSrc);

    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);

    if (BaseOp && !BaseOp->isFI())

      BaseOps.push_back(BaseOp);

    const MachineOperand *OffsetImm =

        getNamedOperand(LdSt, AMDGPU::OpName::offset);

    Offset = OffsetImm->getImm();

    const MachineOperand *SOffset =

        getNamedOperand(LdSt, AMDGPU::OpName::soffset);

    if (SOffset) {

      if (SOffset->isReg())

        BaseOps.push_back(SOffset);

      else

        Offset += SOffset->getImm();

    }

    // Get appropriate operand, and compute width accordingly.

    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);

    if (DataOpIdx == -1)

      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);

    if (DataOpIdx == -1) // LDS DMA

      return false;

    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));

    return true;

  }


  if (isImage(LdSt)) {

    auto RsrcOpName =

        isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;

    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);

    BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));

    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);

    if (VAddr0Idx >= 0) {

      // GFX10 possible NSA encoding.

      for (int I = VAddr0Idx; I < SRsrcIdx; ++I)

        BaseOps.push_back(&LdSt.getOperand(I));

    } else {

      BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));

    }

    Offset = 0;

    // Get appropriate operand, and compute width accordingly.

    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);

    if (DataOpIdx == -1)

      return false; // no return sampler

    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));

    return true;

  }


  if (isSMRD(LdSt)) {

    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);

    if (!BaseOp) // e.g. S_MEMTIME

      return false;

    BaseOps.push_back(BaseOp);

    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);

    Offset = OffsetOp ? OffsetOp->getImm() : 0;

    // Get appropriate operand, and compute width accordingly.

    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);

    if (DataOpIdx == -1)

      return false;

    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));

    return true;

  }


  if (isFLAT(LdSt)) {

    // Instructions have either vaddr or saddr or both or none.

    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);

    if (BaseOp)

      BaseOps.push_back(BaseOp);

    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);

    if (BaseOp)

      BaseOps.push_back(BaseOp);

    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();

    // Get appropriate operand, and compute width accordingly.

    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);

    if (DataOpIdx == -1)

      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);

    if (DataOpIdx == -1) // LDS DMA

      return false;

    Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));

    return true;

  }


  return false;

}


static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,

                                  ArrayRef<const MachineOperand *> BaseOps1,

                                  const MachineInstr &MI2,

                                  ArrayRef<const MachineOperand *> BaseOps2) {

  // Only examine the first "base" operand of each instruction, on the

  // assumption that it represents the real base address of the memory access.

  // Other operands are typically offsets or indices from this base address.

  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))

    return true;


  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())

    return false;


  auto *MO1 = *MI1.memoperands_begin();

  auto *MO2 = *MI2.memoperands_begin();

  if (MO1->getAddrSpace() != MO2->getAddrSpace())

    return false;


  const auto *Base1 = MO1->getValue();

  const auto *Base2 = MO2->getValue();

  if (!Base1 || !Base2)

    return false;

  Base1 = getUnderlyingObject(Base1);

  Base2 = getUnderlyingObject(Base2);


  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))

    return false;


  return Base1 == Base2;

}


bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,

                                      int64_t Offset1, bool OffsetIsScalable1,

                                      ArrayRef<const MachineOperand *> BaseOps2,

                                      int64_t Offset2, bool OffsetIsScalable2,

                                      unsigned ClusterSize,

                                      unsigned NumBytes) const {

  // If the mem ops (to be clustered) do not have the same base ptr, then they

  // should not be clustered

  unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;

  if (!BaseOps1.empty() && !BaseOps2.empty()) {

    const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();

    const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();

    if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))

      return false;


    const SIMachineFunctionInfo *MFI =

        FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();

    MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();

  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {

    // If only one base op is empty, they do not have the same base ptr

    return false;

  }


  // In order to avoid register pressure, on an average, the number of DWORDS

  // loaded together by all clustered mem ops should not exceed

  // MaxMemoryClusterDWords. This is an empirical value based on certain

  // observations and performance related experiments.

  // The good thing about this heuristic is - it avoids clustering of too many

  // sub-word loads, and also avoids clustering of wide loads. Below is the

  // brief summary of how the heuristic behaves for various `LoadSize` when

  // MaxMemoryClusterDWords is 8.

  //

  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops

  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops

  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops

  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops

  // (5) LoadSize >= 17: do not cluster

  const unsigned LoadSize = NumBytes / ClusterSize;

  const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;

  return NumDWords <= MaxMemoryClusterDWords;

}


// FIXME: This behaves strangely. If, for example, you have 32 load + stores,

// the first 16 loads will be interleaved with the stores, and the next 16 will

// be clustered as expected. It should really split into 2 16 store batches.

//

// Loads are clustered until this returns false, rather than trying to schedule

// groups of stores. This also means we have to deal with saying different

// address space loads should be clustered, and ones which might cause bank

// conflicts.

//

// This might be deprecated so it might not be worth that much effort to fix.


bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,

                                          int64_t Offset0, int64_t Offset1,

                                          unsigned NumLoads) const {

  assert(Offset1 > Offset0 &&

         "Second offset should be larger than first offset!");

  // If we have less than 16 loads in a row, and the offsets are within 64

  // bytes, then schedule together.


  // A cacheline is 64 bytes (for global memory).

  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);

}


static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,

                              MachineBasicBlock::iterator MI,

                              const DebugLoc &DL, MCRegister DestReg,

                              MCRegister SrcReg, bool KillSrc,

                              const char *Msg = "illegal VGPR to SGPR copy") {

  MachineFunction *MF = MBB.getParent();


  LLVMContext &C = MF->getFunction().getContext();

  C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));


  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)

      .addReg(SrcReg, getKillRegState(KillSrc));

}


/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not

/// possible to have a direct copy in these cases on GFX908, so an intermediate

/// VGPR copy is required.


static void indirectCopyToAGPR(const SIInstrInfo &TII,

                               MachineBasicBlock &MBB,

                               MachineBasicBlock::iterator MI,

                               const DebugLoc &DL, MCRegister DestReg,

                               MCRegister SrcReg, bool KillSrc,

                               RegScavenger &RS, bool RegsOverlap,

                               Register ImpDefSuperReg = Register(),

                               Register ImpUseSuperReg = Register()) {

  assert((TII.getSubtarget().hasMAIInsts() &&

          !TII.getSubtarget().hasGFX90AInsts()) &&

         "Expected GFX908 subtarget.");


  assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||

          AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&

         "Source register of the copy should be either an SGPR or an AGPR.");


  assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&

         "Destination register of the copy should be an AGPR.");


  const SIRegisterInfo &RI = TII.getRegisterInfo();


  // First try to find defining accvgpr_write to avoid temporary registers.

  // In the case of copies of overlapping AGPRs, we conservatively do not

  // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up

  // an accvgpr_write used for this same copy due to implicit-defs

  if (!RegsOverlap) {

    for (auto Def = MI, E = MBB.begin(); Def != E; ) {

      --Def;


      if (!Def->modifiesRegister(SrcReg, &RI))

        continue;


      if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||

          Def->getOperand(0).getReg() != SrcReg)

        break;


      MachineOperand &DefOp = Def->getOperand(1);

      assert(DefOp.isReg() || DefOp.isImm());


      if (DefOp.isReg()) {

        bool SafeToPropagate = true;

        // Check that register source operand is not clobbered before MI.

        // Immediate operands are always safe to propagate.

        for (auto I = Def; I != MI && SafeToPropagate; ++I)

          if (I->modifiesRegister(DefOp.getReg(), &RI))

            SafeToPropagate = false;


        if (!SafeToPropagate)

          break;


        for (auto I = Def; I != MI; ++I)

          I->clearRegisterKills(DefOp.getReg(), &RI);

      }


      MachineInstrBuilder Builder =

        BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)

        .add(DefOp);

      if (ImpDefSuperReg)

        Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);


      if (ImpUseSuperReg) {

        Builder.addReg(ImpUseSuperReg,

                      getKillRegState(KillSrc) | RegState::Implicit);

      }


      return;

    }

  }


  RS.enterBasicBlockEnd(MBB);

  RS.backward(std::next(MI));


  // Ideally we want to have three registers for a long reg_sequence copy

  // to hide 2 waitstates between v_mov_b32 and accvgpr_write.

  unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,

                                             *MBB.getParent());


  // Registers in the sequence are allocated contiguously so we can just

  // use register number to pick one of three round-robin temps.

  unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;

  Register Tmp =

      MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();

  assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&

         "VGPR used for an intermediate copy should have been reserved.");


  // Only loop through if there are any free registers left. We don't want to

  // spill.

  while (RegNo--) {

    Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,

                                                 /* RestoreAfter */ false, 0,

                                                 /* AllowSpill */ false);

    if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)

      break;

    Tmp = Tmp2;

    RS.setRegUsed(Tmp);

  }


  // Insert copy to temporary VGPR.

  unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;

  if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {

    TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;

  } else {

    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));

  }


  MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)

    .addReg(SrcReg, getKillRegState(KillSrc));

  if (ImpUseSuperReg) {

    UseBuilder.addReg(ImpUseSuperReg,

                      getKillRegState(KillSrc) | RegState::Implicit);

  }


  MachineInstrBuilder DefBuilder

    = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)

    .addReg(Tmp, RegState::Kill);


  if (ImpDefSuperReg)

    DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);

}


static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,

                           MachineBasicBlock::iterator MI, const DebugLoc &DL,

                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,

                           const TargetRegisterClass *RC, bool Forward) {

  const SIRegisterInfo &RI = TII.getRegisterInfo();

  ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);

  MachineBasicBlock::iterator I = MI;

  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;


  for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {

    int16_t SubIdx = BaseIndices[Idx];

    Register DestSubReg = RI.getSubReg(DestReg, SubIdx);

    Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);

    assert(DestSubReg && SrcSubReg && "Failed to find subregs!");

    unsigned Opcode = AMDGPU::S_MOV_B32;


    // Is SGPR aligned? If so try to combine with next.

    bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;

    bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;

    if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {

      // Can use SGPR64 copy

      unsigned Channel = RI.getChannelFromSubReg(SubIdx);

      SubIdx = RI.getSubRegFromChannel(Channel, 2);

      DestSubReg = RI.getSubReg(DestReg, SubIdx);

      SrcSubReg = RI.getSubReg(SrcReg, SubIdx);

      assert(DestSubReg && SrcSubReg && "Failed to find subregs!");

      Opcode = AMDGPU::S_MOV_B64;

      Idx++;

    }


    LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)

                 .addReg(SrcSubReg)

                 .addReg(SrcReg, RegState::Implicit);


    if (!FirstMI)

      FirstMI = LastMI;


    if (!Forward)

      I--;

  }


  assert(FirstMI && LastMI);

  if (!Forward)

    std::swap(FirstMI, LastMI);


  FirstMI->addOperand(

      MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));


  if (KillSrc)

    LastMI->addRegisterKilled(SrcReg, &RI);

}


void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,

                              MachineBasicBlock::iterator MI,

                              const DebugLoc &DL, Register DestReg,

                              Register SrcReg, bool KillSrc, bool RenamableDest,

                              bool RenamableSrc) const {

  const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);

  unsigned Size = RI.getRegSizeInBits(*RC);

  const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);

  unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);


  // The rest of copyPhysReg assumes Src and Dst size are the same size.

  // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can

  // we remove Fix16BitCopies and this code block?

  if (Fix16BitCopies) {

    if (((Size == 16) != (SrcSize == 16))) {

      // Non-VGPR Src and Dst will later be expanded back to 32 bits.

      assert(ST.useRealTrue16Insts());

      Register &RegToFix = (Size == 32) ? DestReg : SrcReg;

      MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);

      RegToFix = SubReg;


      if (DestReg == SrcReg) {

        // Identity copy. Insert empty bundle since ExpandPostRA expects an

        // instruction here.

        BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));

        return;

      }

      RC = RI.getPhysRegBaseClass(DestReg);

      Size = RI.getRegSizeInBits(*RC);

      SrcRC = RI.getPhysRegBaseClass(SrcReg);

      SrcSize = RI.getRegSizeInBits(*SrcRC);

    }

  }


  if (RC == &AMDGPU::VGPR_32RegClass) {

    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||

           AMDGPU::SReg_32RegClass.contains(SrcReg) ||

           AMDGPU::AGPR_32RegClass.contains(SrcReg));

    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?

                     AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;

    BuildMI(MBB, MI, DL, get(Opc), DestReg)

      .addReg(SrcReg, getKillRegState(KillSrc));

    return;

  }


  if (RC == &AMDGPU::SReg_32_XM0RegClass ||

      RC == &AMDGPU::SReg_32RegClass) {

    if (SrcReg == AMDGPU::SCC) {

      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)

          .addImm(1)

          .addImm(0);

      return;

    }


    if (DestReg == AMDGPU::VCC_LO) {

      if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {

        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)

          .addReg(SrcReg, getKillRegState(KillSrc));

      } else {

        // FIXME: Hack until VReg_1 removed.

        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));

        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))

          .addImm(0)

          .addReg(SrcReg, getKillRegState(KillSrc));

      }


      return;

    }


    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {

      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);

      return;

    }


    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)

            .addReg(SrcReg, getKillRegState(KillSrc));

    return;

  }


  if (RC == &AMDGPU::SReg_64RegClass) {

    if (SrcReg == AMDGPU::SCC) {

      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)

          .addImm(1)

          .addImm(0);

      return;

    }


    if (DestReg == AMDGPU::VCC) {

      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {

        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)

          .addReg(SrcReg, getKillRegState(KillSrc));

      } else {

        // FIXME: Hack until VReg_1 removed.

        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));

        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))

          .addImm(0)

          .addReg(SrcReg, getKillRegState(KillSrc));

      }


      return;

    }


    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {

      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);

      return;

    }


    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)

            .addReg(SrcReg, getKillRegState(KillSrc));

    return;

  }


  if (DestReg == AMDGPU::SCC) {

    // Copying 64-bit or 32-bit sources to SCC barely makes sense,

    // but SelectionDAG emits such copies for i1 sources.

    if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {

      // This copy can only be produced by patterns

      // with explicit SCC, which are known to be enabled

      // only for subtargets with S_CMP_LG_U64 present.

      assert(ST.hasScalarCompareEq64());

      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))

          .addReg(SrcReg, getKillRegState(KillSrc))

          .addImm(0);

    } else {

      assert(AMDGPU::SReg_32RegClass.contains(SrcReg));

      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))

          .addReg(SrcReg, getKillRegState(KillSrc))

          .addImm(0);

    }


    return;

  }


  if (RC == &AMDGPU::AGPR_32RegClass) {

    if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||

        (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {

      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)

        .addReg(SrcReg, getKillRegState(KillSrc));

      return;

    }


    if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {

      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)

        .addReg(SrcReg, getKillRegState(KillSrc));

      return;

    }


    // FIXME: Pass should maintain scavenger to avoid scan through the block on

    // every AGPR spill.

    RegScavenger RS;

    const bool Overlap = RI.regsOverlap(SrcReg, DestReg);

    indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);

    return;

  }


  if (Size == 16) {

    assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||

           AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||

           AMDGPU::AGPR_LO16RegClass.contains(SrcReg));


    bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);

    bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);

    bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);

    bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);

    bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);

    bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);

    MCRegister NewDestReg = RI.get32BitRegister(DestReg);

    MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);


    if (IsSGPRDst) {

      if (!IsSGPRSrc) {

        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);

        return;

      }


      BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)

        .addReg(NewSrcReg, getKillRegState(KillSrc));

      return;

    }


    if (IsAGPRDst || IsAGPRSrc) {

      if (!DstLow || !SrcLow) {

        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,

                          "Cannot use hi16 subreg with an AGPR!");

      }


      copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);

      return;

    }


    if (ST.useRealTrue16Insts()) {

      if (IsSGPRSrc) {

        assert(SrcLow);

        SrcReg = NewSrcReg;

      }

      // Use the smaller instruction encoding if possible.

      if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&

          (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {

        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)

            .addReg(SrcReg);

      } else {

        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)

            .addImm(0) // src0_modifiers

            .addReg(SrcReg)

            .addImm(0); // op_sel

      }

      return;

    }


    if (IsSGPRSrc && !ST.hasSDWAScalar()) {

      if (!DstLow || !SrcLow) {

        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,

                          "Cannot use hi16 subreg on VI!");

      }


      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)

        .addReg(NewSrcReg, getKillRegState(KillSrc));

      return;

    }


    auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)

      .addImm(0) // src0_modifiers

      .addReg(NewSrcReg)

      .addImm(0) // clamp

      .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0

                     : AMDGPU::SDWA::SdwaSel::WORD_1)

      .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)

      .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0

                     : AMDGPU::SDWA::SdwaSel::WORD_1)

      .addReg(NewDestReg, RegState::Implicit | RegState::Undef);

    // First implicit operand is $exec.

    MIB->tieOperands(0, MIB->getNumOperands() - 1);

    return;

  }


  if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {

    if (ST.hasMovB64()) {

      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)

        .addReg(SrcReg, getKillRegState(KillSrc));

      return;

    }

    if (ST.hasPkMovB32()) {

      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)

        .addImm(SISrcMods::OP_SEL_1)

        .addReg(SrcReg)

        .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)

        .addReg(SrcReg)

        .addImm(0) // op_sel_lo

        .addImm(0) // op_sel_hi

        .addImm(0) // neg_lo

        .addImm(0) // neg_hi

        .addImm(0) // clamp

        .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);

      return;

    }

  }


  const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);

  if (RI.isSGPRClass(RC)) {

    if (!RI.isSGPRClass(SrcRC)) {

      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);

      return;

    }

    const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);

    expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,

                   Forward);

    return;

  }


  unsigned EltSize = 4;

  unsigned Opcode = AMDGPU::V_MOV_B32_e32;

  if (RI.isAGPRClass(RC)) {

    if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))

      Opcode = AMDGPU::V_ACCVGPR_MOV_B32;

    else if (RI.hasVGPRs(SrcRC) ||

             (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))

      Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;

    else

      Opcode = AMDGPU::INSTRUCTION_LIST_END;

  } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {

    Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;

  } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&

             (RI.isProperlyAlignedRC(*RC) &&

              (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {

    // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.

    if (ST.hasMovB64()) {

      Opcode = AMDGPU::V_MOV_B64_e32;

      EltSize = 8;

    } else if (ST.hasPkMovB32()) {

      Opcode = AMDGPU::V_PK_MOV_B32;

      EltSize = 8;

    }

  }


  // For the cases where we need an intermediate instruction/temporary register

  // (destination is an AGPR), we need a scavenger.

  //

  // FIXME: The pass should maintain this for us so we don't have to re-scan the

  // whole block for every handled copy.

  std::unique_ptr<RegScavenger> RS;

  if (Opcode == AMDGPU::INSTRUCTION_LIST_END)

    RS = std::make_unique<RegScavenger>();


  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);


  // If there is an overlap, we can't kill the super-register on the last

  // instruction, since it will also kill the components made live by this def.

  const bool Overlap = RI.regsOverlap(SrcReg, DestReg);

  const bool CanKillSuperReg = KillSrc && !Overlap;


  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {

    unsigned SubIdx;

    if (Forward)

      SubIdx = SubIndices[Idx];

    else

      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    Register DestSubReg = RI.getSubReg(DestReg, SubIdx);

    Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);

    assert(DestSubReg && SrcSubReg && "Failed to find subregs!");


    bool IsFirstSubreg = Idx == 0;

    bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;


    if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {

      Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();

      Register ImpUseSuper = SrcReg;

      indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,

                         *RS, Overlap, ImpDefSuper, ImpUseSuper);

    } else if (Opcode == AMDGPU::V_PK_MOV_B32) {

      MachineInstrBuilder MIB =

          BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)

              .addImm(SISrcMods::OP_SEL_1)

              .addReg(SrcSubReg)

              .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)

              .addReg(SrcSubReg)

              .addImm(0) // op_sel_lo

              .addImm(0) // op_sel_hi

              .addImm(0) // neg_lo

              .addImm(0) // neg_hi

              .addImm(0) // clamp

              .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);

      if (IsFirstSubreg)

        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);

    } else {

      MachineInstrBuilder Builder =

          BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);

      if (IsFirstSubreg)

        Builder.addReg(DestReg, RegState::Define | RegState::Implicit);


      Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);

    }

  }

}


int SIInstrInfo::commuteOpcode(unsigned Opcode) const {

  int NewOpc;


  // Try to map original to commuted opcode

  NewOpc = AMDGPU::getCommuteRev(Opcode);

  if (NewOpc != -1)

    // Check if the commuted (REV) opcode exists on the target.

    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;


  // Try to map commuted to original opcode

  NewOpc = AMDGPU::getCommuteOrig(Opcode);

  if (NewOpc != -1)

    // Check if the original (non-REV) opcode exists on the target.

    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;


  return Opcode;

}


const TargetRegisterClass *


SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {

  return &AMDGPU::VGPR_32RegClass;

}


void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,

                                     MachineBasicBlock::iterator I,

                                     const DebugLoc &DL, Register DstReg,

                                     ArrayRef<MachineOperand> Cond,

                                     Register TrueReg,

                                     Register FalseReg) const {

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();

  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&

         "Not a VGPR32 reg");


  if (Cond.size() == 1) {

    Register SReg = MRI.createVirtualRegister(BoolXExecRC);

    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)

      .add(Cond[0]);

    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)

      .addImm(0)

      .addReg(FalseReg)

      .addImm(0)

      .addReg(TrueReg)

      .addReg(SReg);

  } else if (Cond.size() == 2) {

    assert(Cond[0].isImm() && "Cond[0] is not an immediate");

    switch (Cond[0].getImm()) {

    case SIInstrInfo::SCC_TRUE: {

      Register SReg = MRI.createVirtualRegister(BoolXExecRC);

      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);

      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)

        .addImm(0)

        .addReg(FalseReg)

        .addImm(0)

        .addReg(TrueReg)

        .addReg(SReg);

      break;

    }

    case SIInstrInfo::SCC_FALSE: {

      Register SReg = MRI.createVirtualRegister(BoolXExecRC);

      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);

      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)

        .addImm(0)

        .addReg(FalseReg)

        .addImm(0)

        .addReg(TrueReg)

        .addReg(SReg);

      break;

    }

    case SIInstrInfo::VCCNZ: {

      MachineOperand RegOp = Cond[1];

      RegOp.setImplicit(false);

      Register SReg = MRI.createVirtualRegister(BoolXExecRC);

      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)

        .add(RegOp);

      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)

          .addImm(0)

          .addReg(FalseReg)

          .addImm(0)

          .addReg(TrueReg)

          .addReg(SReg);

      break;

    }

    case SIInstrInfo::VCCZ: {

      MachineOperand RegOp = Cond[1];

      RegOp.setImplicit(false);

      Register SReg = MRI.createVirtualRegister(BoolXExecRC);

      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)

        .add(RegOp);

      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)

          .addImm(0)

          .addReg(TrueReg)

          .addImm(0)

          .addReg(FalseReg)

          .addReg(SReg);

      break;

    }

    case SIInstrInfo::EXECNZ: {

      Register SReg = MRI.createVirtualRegister(BoolXExecRC);

      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());

      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);

      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);

      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)

        .addImm(0)

        .addReg(FalseReg)

        .addImm(0)

        .addReg(TrueReg)

        .addReg(SReg);

      break;

    }

    case SIInstrInfo::EXECZ: {

      Register SReg = MRI.createVirtualRegister(BoolXExecRC);

      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());

      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);

      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);

      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)

        .addImm(0)

        .addReg(FalseReg)

        .addImm(0)

        .addReg(TrueReg)

        .addReg(SReg);

      llvm_unreachable("Unhandled branch predicate EXECZ");

      break;

    }

    default:

      llvm_unreachable("invalid branch predicate");

    }

  } else {

    llvm_unreachable("Can only handle Cond size 1 or 2");

  }

}


Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,

                               MachineBasicBlock::iterator I,

                               const DebugLoc &DL,

                               Register SrcReg, int Value) const {

  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());

  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)

    .addImm(Value)

    .addReg(SrcReg);


  return Reg;

}


Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,

                               MachineBasicBlock::iterator I,

                               const DebugLoc &DL,

                               Register SrcReg, int Value) const {

  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());

  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)

    .addImm(Value)

    .addReg(SrcReg);


  return Reg;

}


bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,

                                          const Register Reg,

                                          int64_t &ImmVal) const {

  switch (MI.getOpcode()) {

  case AMDGPU::V_MOV_B32_e32:

  case AMDGPU::S_MOV_B32:

  case AMDGPU::S_MOVK_I32:

  case AMDGPU::S_MOV_B64:

  case AMDGPU::V_MOV_B64_e32:

  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:

  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:

  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:

  case AMDGPU::S_MOV_B64_IMM_PSEUDO:

  case AMDGPU::V_MOV_B64_PSEUDO: {

    const MachineOperand &Src0 = MI.getOperand(1);

    if (Src0.isImm()) {

      ImmVal = Src0.getImm();

      return MI.getOperand(0).getReg() == Reg;

    }


    return false;

  }

  case AMDGPU::S_BREV_B32:

  case AMDGPU::V_BFREV_B32_e32:

  case AMDGPU::V_BFREV_B32_e64: {

    const MachineOperand &Src0 = MI.getOperand(1);

    if (Src0.isImm()) {

      ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));

      return MI.getOperand(0).getReg() == Reg;

    }


    return false;

  }

  case AMDGPU::S_NOT_B32:

  case AMDGPU::V_NOT_B32_e32:

  case AMDGPU::V_NOT_B32_e64: {

    const MachineOperand &Src0 = MI.getOperand(1);

    if (Src0.isImm()) {

      ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));

      return MI.getOperand(0).getReg() == Reg;

    }


    return false;

  }

  default:

    return false;

  }

}


unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {


  if (RI.isAGPRClass(DstRC))

    return AMDGPU::COPY;

  if (RI.getRegSizeInBits(*DstRC) == 16) {

    // Assume hi bits are unneeded. Only _e64 true16 instructions are legal

    // before RA.

    return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;

  }

  if (RI.getRegSizeInBits(*DstRC) == 32)

    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))

    return AMDGPU::S_MOV_B64;

  if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))

    return AMDGPU::V_MOV_B64_PSEUDO;

  return AMDGPU::COPY;

}


const MCInstrDesc &


SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,

                                     bool IsIndirectSrc) const {

  if (IsIndirectSrc) {

    if (VecSize <= 32) // 4 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);

    if (VecSize <= 64) // 8 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);

    if (VecSize <= 96) // 12 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);

    if (VecSize <= 128) // 16 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);

    if (VecSize <= 160) // 20 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);

    if (VecSize <= 256) // 32 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);

    if (VecSize <= 288) // 36 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);

    if (VecSize <= 320) // 40 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);

    if (VecSize <= 352) // 44 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);

    if (VecSize <= 384) // 48 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);

    if (VecSize <= 512) // 64 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);

    if (VecSize <= 1024) // 128 bytes

      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);


    llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");

  }


  if (VecSize <= 32) // 4 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);

  if (VecSize <= 64) // 8 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);

  if (VecSize <= 96) // 12 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);

  if (VecSize <= 128) // 16 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);

  if (VecSize <= 160) // 20 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);

  if (VecSize <= 256) // 32 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);

  if (VecSize <= 288) // 36 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);

  if (VecSize <= 320) // 40 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);

  if (VecSize <= 352) // 44 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);

  if (VecSize <= 384) // 48 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);

  if (VecSize <= 512) // 64 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);

  if (VecSize <= 1024) // 128 bytes

    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);


  llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");

}


static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {

  if (VecSize <= 32) // 4 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;

  if (VecSize <= 64) // 8 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;

  if (VecSize <= 96) // 12 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;

  if (VecSize <= 128) // 16 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;

  if (VecSize <= 160) // 20 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;

  if (VecSize <= 256) // 32 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;

  if (VecSize <= 288) // 36 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;

  if (VecSize <= 320) // 40 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;

  if (VecSize <= 352) // 44 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;

  if (VecSize <= 384) // 48 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;

  if (VecSize <= 512) // 64 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;

  if (VecSize <= 1024) // 128 bytes

    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;


  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");

}


static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {

  if (VecSize <= 32) // 4 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;

  if (VecSize <= 64) // 8 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;

  if (VecSize <= 96) // 12 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;

  if (VecSize <= 128) // 16 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;

  if (VecSize <= 160) // 20 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;

  if (VecSize <= 256) // 32 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;

  if (VecSize <= 288) // 36 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;

  if (VecSize <= 320) // 40 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;

  if (VecSize <= 352) // 44 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;

  if (VecSize <= 384) // 48 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;

  if (VecSize <= 512) // 64 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;

  if (VecSize <= 1024) // 128 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;


  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");

}


static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {

  if (VecSize <= 64) // 8 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;

  if (VecSize <= 128) // 16 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;

  if (VecSize <= 256) // 32 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;

  if (VecSize <= 512) // 64 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;

  if (VecSize <= 1024) // 128 bytes

    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;


  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");

}


const MCInstrDesc &


SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,

                                             bool IsSGPR) const {

  if (IsSGPR) {

    switch (EltSize) {

    case 32:

      return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));

    case 64:

      return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));

    default:

      llvm_unreachable("invalid reg indexing elt size");

    }

  }


  assert(EltSize == 32 && "invalid reg indexing elt size");

  return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));

}


static unsigned getSGPRSpillSaveOpcode(unsigned Size) {

  switch (Size) {

  case 4:

    return AMDGPU::SI_SPILL_S32_SAVE;

  case 8:

    return AMDGPU::SI_SPILL_S64_SAVE;

  case 12:

    return AMDGPU::SI_SPILL_S96_SAVE;

  case 16:

    return AMDGPU::SI_SPILL_S128_SAVE;

  case 20:

    return AMDGPU::SI_SPILL_S160_SAVE;

  case 24:

    return AMDGPU::SI_SPILL_S192_SAVE;

  case 28:

    return AMDGPU::SI_SPILL_S224_SAVE;

  case 32:

    return AMDGPU::SI_SPILL_S256_SAVE;

  case 36:

    return AMDGPU::SI_SPILL_S288_SAVE;

  case 40:

    return AMDGPU::SI_SPILL_S320_SAVE;

  case 44:

    return AMDGPU::SI_SPILL_S352_SAVE;

  case 48:

    return AMDGPU::SI_SPILL_S384_SAVE;

  case 64:

    return AMDGPU::SI_SPILL_S512_SAVE;

  case 128:

    return AMDGPU::SI_SPILL_S1024_SAVE;

  default:

    llvm_unreachable("unknown register size");

  }

}


static unsigned getVGPRSpillSaveOpcode(unsigned Size) {

  switch (Size) {

  case 2:

    return AMDGPU::SI_SPILL_V16_SAVE;

  case 4:

    return AMDGPU::SI_SPILL_V32_SAVE;

  case 8:

    return AMDGPU::SI_SPILL_V64_SAVE;

  case 12:

    return AMDGPU::SI_SPILL_V96_SAVE;

  case 16:

    return AMDGPU::SI_SPILL_V128_SAVE;

  case 20:

    return AMDGPU::SI_SPILL_V160_SAVE;

  case 24:

    return AMDGPU::SI_SPILL_V192_SAVE;

  case 28:

    return AMDGPU::SI_SPILL_V224_SAVE;

  case 32:

    return AMDGPU::SI_SPILL_V256_SAVE;

  case 36:

    return AMDGPU::SI_SPILL_V288_SAVE;

  case 40:

    return AMDGPU::SI_SPILL_V320_SAVE;

  case 44:

    return AMDGPU::SI_SPILL_V352_SAVE;

  case 48:

    return AMDGPU::SI_SPILL_V384_SAVE;

  case 64:

    return AMDGPU::SI_SPILL_V512_SAVE;

  case 128:

    return AMDGPU::SI_SPILL_V1024_SAVE;

  default:

    llvm_unreachable("unknown register size");

  }

}


static unsigned getAVSpillSaveOpcode(unsigned Size) {

  switch (Size) {

  case 4:

    return AMDGPU::SI_SPILL_AV32_SAVE;

  case 8:

    return AMDGPU::SI_SPILL_AV64_SAVE;

  case 12:

    return AMDGPU::SI_SPILL_AV96_SAVE;

  case 16:

    return AMDGPU::SI_SPILL_AV128_SAVE;

  case 20:

    return AMDGPU::SI_SPILL_AV160_SAVE;

  case 24:

    return AMDGPU::SI_SPILL_AV192_SAVE;

  case 28:

    return AMDGPU::SI_SPILL_AV224_SAVE;

  case 32:

    return AMDGPU::SI_SPILL_AV256_SAVE;

  case 36:

    return AMDGPU::SI_SPILL_AV288_SAVE;

  case 40:

    return AMDGPU::SI_SPILL_AV320_SAVE;

  case 44:

    return AMDGPU::SI_SPILL_AV352_SAVE;

  case 48:

    return AMDGPU::SI_SPILL_AV384_SAVE;

  case 64:

    return AMDGPU::SI_SPILL_AV512_SAVE;

  case 128:

    return AMDGPU::SI_SPILL_AV1024_SAVE;

  default:

    llvm_unreachable("unknown register size");

  }

}


static unsigned getWWMRegSpillSaveOpcode(unsigned Size,

                                         bool IsVectorSuperClass) {

  // Currently, there is only 32-bit WWM register spills needed.

  if (Size != 4)

    llvm_unreachable("unknown wwm register spill size");


  if (IsVectorSuperClass)

    return AMDGPU::SI_SPILL_WWM_AV32_SAVE;


  return AMDGPU::SI_SPILL_WWM_V32_SAVE;

}


unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(

    Register Reg, const TargetRegisterClass *RC, unsigned Size,

    const SIMachineFunctionInfo &MFI) const {

  bool IsVectorSuperClass = RI.isVectorSuperClass(RC);


  // Choose the right opcode if spilling a WWM register.

  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))

    return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);


  // TODO: Check if AGPRs are available

  if (ST.hasMAIInsts())

    return getAVSpillSaveOpcode(Size);


  return getVGPRSpillSaveOpcode(Size);

}


void SIInstrInfo::storeRegToStackSlot(

    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,

    bool isKill, int FrameIndex, const TargetRegisterClass *RC,

    const TargetRegisterInfo *TRI, Register VReg,

    MachineInstr::MIFlag Flags) const {

  MachineFunction *MF = MBB.getParent();

  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();

  const DebugLoc &DL = MBB.findDebugLoc(MI);


  MachinePointerInfo PtrInfo

    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(

      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),

      FrameInfo.getObjectAlign(FrameIndex));

  unsigned SpillSize = TRI->getSpillSize(*RC);


  MachineRegisterInfo &MRI = MF->getRegInfo();

  if (RI.isSGPRClass(RC)) {

    MFI->setHasSpilledSGPRs();

    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");

    assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&

           SrcReg != AMDGPU::EXEC && "exec should not be spilled");


    // We are only allowed to create one new instruction when spilling

    // registers, so we need to use pseudo instruction for spilling SGPRs.

    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));


    // The SGPR spill/restore instructions only work on number sgprs, so we need

    // to make sure we are using the correct register class.

    if (SrcReg.isVirtual() && SpillSize == 4) {

      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);

    }


    BuildMI(MBB, MI, DL, OpDesc)

      .addReg(SrcReg, getKillRegState(isKill)) // data

      .addFrameIndex(FrameIndex)               // addr

      .addMemOperand(MMO)

      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);


    if (RI.spillSGPRToVGPR())

      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);

    return;

  }


  unsigned Opcode =

      getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);

  MFI->setHasSpilledVGPRs();


  BuildMI(MBB, MI, DL, get(Opcode))

    .addReg(SrcReg, getKillRegState(isKill)) // data

    .addFrameIndex(FrameIndex)               // addr

    .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset

    .addImm(0)                               // offset

    .addMemOperand(MMO);

}


static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {

  switch (Size) {

  case 4:

    return AMDGPU::SI_SPILL_S32_RESTORE;

  case 8:

    return AMDGPU::SI_SPILL_S64_RESTORE;

  case 12:

    return AMDGPU::SI_SPILL_S96_RESTORE;

  case 16:

    return AMDGPU::SI_SPILL_S128_RESTORE;

  case 20:

    return AMDGPU::SI_SPILL_S160_RESTORE;

  case 24:

    return AMDGPU::SI_SPILL_S192_RESTORE;

  case 28:

    return AMDGPU::SI_SPILL_S224_RESTORE;

  case 32:

    return AMDGPU::SI_SPILL_S256_RESTORE;

  case 36:

    return AMDGPU::SI_SPILL_S288_RESTORE;

  case 40:

    return AMDGPU::SI_SPILL_S320_RESTORE;

  case 44:

    return AMDGPU::SI_SPILL_S352_RESTORE;

  case 48:

    return AMDGPU::SI_SPILL_S384_RESTORE;

  case 64:

    return AMDGPU::SI_SPILL_S512_RESTORE;

  case 128:

    return AMDGPU::SI_SPILL_S1024_RESTORE;

  default:

    llvm_unreachable("unknown register size");

  }

}


static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {

  switch (Size) {

  case 2:

    return AMDGPU::SI_SPILL_V16_RESTORE;

  case 4:

    return AMDGPU::SI_SPILL_V32_RESTORE;

  case 8:

    return AMDGPU::SI_SPILL_V64_RESTORE;

  case 12:

    return AMDGPU::SI_SPILL_V96_RESTORE;

  case 16:

    return AMDGPU::SI_SPILL_V128_RESTORE;

  case 20:

    return AMDGPU::SI_SPILL_V160_RESTORE;

  case 24:

    return AMDGPU::SI_SPILL_V192_RESTORE;

  case 28:

    return AMDGPU::SI_SPILL_V224_RESTORE;

  case 32:

    return AMDGPU::SI_SPILL_V256_RESTORE;

  case 36:

    return AMDGPU::SI_SPILL_V288_RESTORE;

  case 40:

    return AMDGPU::SI_SPILL_V320_RESTORE;

  case 44:

    return AMDGPU::SI_SPILL_V352_RESTORE;

  case 48:

    return AMDGPU::SI_SPILL_V384_RESTORE;

  case 64:

    return AMDGPU::SI_SPILL_V512_RESTORE;

  case 128:

    return AMDGPU::SI_SPILL_V1024_RESTORE;

  default:

    llvm_unreachable("unknown register size");

  }

}


static unsigned getAVSpillRestoreOpcode(unsigned Size) {

  switch (Size) {

  case 4:

    return AMDGPU::SI_SPILL_AV32_RESTORE;

  case 8:

    return AMDGPU::SI_SPILL_AV64_RESTORE;

  case 12:

    return AMDGPU::SI_SPILL_AV96_RESTORE;

  case 16:

    return AMDGPU::SI_SPILL_AV128_RESTORE;

  case 20:

    return AMDGPU::SI_SPILL_AV160_RESTORE;

  case 24:

    return AMDGPU::SI_SPILL_AV192_RESTORE;

  case 28:

    return AMDGPU::SI_SPILL_AV224_RESTORE;

  case 32:

    return AMDGPU::SI_SPILL_AV256_RESTORE;

  case 36:

    return AMDGPU::SI_SPILL_AV288_RESTORE;

  case 40:

    return AMDGPU::SI_SPILL_AV320_RESTORE;

  case 44:

    return AMDGPU::SI_SPILL_AV352_RESTORE;

  case 48:

    return AMDGPU::SI_SPILL_AV384_RESTORE;

  case 64:

    return AMDGPU::SI_SPILL_AV512_RESTORE;

  case 128:

    return AMDGPU::SI_SPILL_AV1024_RESTORE;

  default:

    llvm_unreachable("unknown register size");

  }

}


static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,

                                            bool IsVectorSuperClass) {

  // Currently, there is only 32-bit WWM register spills needed.

  if (Size != 4)

    llvm_unreachable("unknown wwm register spill size");


  if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs

    return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;


  return AMDGPU::SI_SPILL_WWM_V32_RESTORE;

}


unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(

    Register Reg, const TargetRegisterClass *RC, unsigned Size,

    const SIMachineFunctionInfo &MFI) const {

  bool IsVectorSuperClass = RI.isVectorSuperClass(RC);


  // Choose the right opcode if restoring a WWM register.

  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))

    return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);


  // TODO: Check if AGPRs are available

  if (ST.hasMAIInsts())

    return getAVSpillRestoreOpcode(Size);


  assert(!RI.isAGPRClass(RC));

  return getVGPRSpillRestoreOpcode(Size);

}


void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,

                                       MachineBasicBlock::iterator MI,

                                       Register DestReg, int FrameIndex,

                                       const TargetRegisterClass *RC,

                                       const TargetRegisterInfo *TRI,

                                       Register VReg,

                                       MachineInstr::MIFlag Flags) const {

  MachineFunction *MF = MBB.getParent();

  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();

  const DebugLoc &DL = MBB.findDebugLoc(MI);

  unsigned SpillSize = TRI->getSpillSize(*RC);


  MachinePointerInfo PtrInfo

    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);


  MachineMemOperand *MMO = MF->getMachineMemOperand(

      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),

      FrameInfo.getObjectAlign(FrameIndex));


  if (RI.isSGPRClass(RC)) {

    MFI->setHasSpilledSGPRs();

    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");

    assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&

           DestReg != AMDGPU::EXEC && "exec should not be spilled");


    // FIXME: Maybe this should not include a memoperand because it will be

    // lowered to non-memory instructions.

    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));

    if (DestReg.isVirtual() && SpillSize == 4) {

      MachineRegisterInfo &MRI = MF->getRegInfo();

      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);

    }


    if (RI.spillSGPRToVGPR())

      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);

    BuildMI(MBB, MI, DL, OpDesc, DestReg)

      .addFrameIndex(FrameIndex) // addr

      .addMemOperand(MMO)

      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);


    return;

  }


  unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,

                                                   SpillSize, *MFI);

  BuildMI(MBB, MI, DL, get(Opcode), DestReg)

      .addFrameIndex(FrameIndex)           // vaddr

      .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset

      .addImm(0)                           // offset

      .addMemOperand(MMO);

}


void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,

                             MachineBasicBlock::iterator MI) const {

  insertNoops(MBB, MI, 1);

}


void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,

                              MachineBasicBlock::iterator MI,

                              unsigned Quantity) const {

  DebugLoc DL = MBB.findDebugLoc(MI);

  while (Quantity > 0) {

    unsigned Arg = std::min(Quantity, 8u);

    Quantity -= Arg;

    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);

  }

}


void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {

  auto *MF = MBB.getParent();

  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();


  assert(Info->isEntryFunction());


  if (MBB.succ_empty()) {

    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();

    if (HasNoTerminator) {

      if (Info->returnsVoid()) {

        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);

      } else {

        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));

      }

    }

  }

}


MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,

                                                    MachineBasicBlock &MBB,

                                                    MachineInstr &MI,

                                                    const DebugLoc &DL) const {

  MachineFunction *MF = MBB.getParent();

  constexpr unsigned DoorbellIDMask = 0x3ff;

  constexpr unsigned ECQueueWaveAbort = 0x400;


  MachineBasicBlock *TrapBB = &MBB;

  MachineBasicBlock *ContBB = &MBB;

  MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();


  if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {

    ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);

    TrapBB = MF->CreateMachineBasicBlock();

    BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);

    MF->push_back(TrapBB);

    MBB.addSuccessor(TrapBB);

  }


  // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this

  // will be a nop.

  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))

      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));

  Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),

          DoorbellReg)

      .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);

  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)

      .addUse(AMDGPU::M0);

  Register DoorbellRegMasked =

      MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)

      .addUse(DoorbellReg)

      .addImm(DoorbellIDMask);

  Register SetWaveAbortBit =

      MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)

      .addUse(DoorbellRegMasked)

      .addImm(ECQueueWaveAbort);

  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      .addUse(SetWaveAbortBit);

  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))

      .addImm(AMDGPU::SendMsg::ID_INTERRUPT);

  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      .addUse(AMDGPU::TTMP2);

  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);

  TrapBB->addSuccessor(HaltLoopBB);


  BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);

  BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))

      .addMBB(HaltLoopBB);

  MF->push_back(HaltLoopBB);

  HaltLoopBB->addSuccessor(HaltLoopBB);


  return ContBB;

}


unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {

  switch (MI.getOpcode()) {

  default:

    if (MI.isMetaInstruction())

      return 0;

    return 1; // FIXME: Do wait states equal cycles?


  case AMDGPU::S_NOP:

    return MI.getOperand(0).getImm() + 1;

  // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The

  // hazard, even if one exist, won't really be visible. Should we handle it?

  }

}


bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {

  MachineBasicBlock &MBB = *MI.getParent();

  DebugLoc DL = MBB.findDebugLoc(MI);

  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

  switch (MI.getOpcode()) {

  default: return TargetInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::S_MOV_B64_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_MOV_B64));

    break;


  case AMDGPU::S_MOV_B32_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_MOV_B32));

    break;


  case AMDGPU::S_XOR_B64_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_XOR_B64));

    break;


  case AMDGPU::S_XOR_B32_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_XOR_B32));

    break;

  case AMDGPU::S_OR_B64_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_OR_B64));

    break;

  case AMDGPU::S_OR_B32_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_OR_B32));

    break;


  case AMDGPU::S_ANDN2_B64_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_ANDN2_B64));

    break;


  case AMDGPU::S_ANDN2_B32_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_ANDN2_B32));

    break;


  case AMDGPU::S_AND_B64_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_AND_B64));

    break;


  case AMDGPU::S_AND_B32_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_AND_B32));

    break;


  case AMDGPU::S_AND_SAVEEXEC_B64_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));

    break;


  case AMDGPU::S_AND_SAVEEXEC_B32_term:

    // This is only a terminator to get the correct spill code placement during

    // register allocation.

    MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));

    break;


  case AMDGPU::SI_SPILL_S32_TO_VGPR:

    MI.setDesc(get(AMDGPU::V_WRITELANE_B32));

    break;


  case AMDGPU::SI_RESTORE_S32_FROM_VGPR:

    MI.setDesc(get(AMDGPU::V_READLANE_B32));

    MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(),

                                               &AMDGPU::SReg_32_XM0RegClass);

    break;

  case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {

    Register Dst = MI.getOperand(0).getReg();

    bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));

    MI.setDesc(

        get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));

    break;

  }

  case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {

    Register Dst = MI.getOperand(0).getReg();

    if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {

      int64_t Imm = MI.getOperand(1).getImm();


      Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);

      Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)

          .addImm(SignExtend64<32>(Imm))

          .addReg(Dst, RegState::Implicit | RegState::Define);

      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)

          .addImm(SignExtend64<32>(Imm >> 32))

          .addReg(Dst, RegState::Implicit | RegState::Define);

      MI.eraseFromParent();

      break;

    }


    [[fallthrough]];

  }

  case AMDGPU::V_MOV_B64_PSEUDO: {

    Register Dst = MI.getOperand(0).getReg();

    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);

    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);


    const MachineOperand &SrcOp = MI.getOperand(1);

    // FIXME: Will this work for 64-bit floating point immediates?

    assert(!SrcOp.isFPImm());

    if (ST.hasMovB64()) {

      MI.setDesc(get(AMDGPU::V_MOV_B64_e32));

      if (SrcOp.isReg() || isInlineConstant(MI, 1) ||

          isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())

        break;

    }

    if (SrcOp.isImm()) {

      APInt Imm(64, SrcOp.getImm());

      APInt Lo(32, Imm.getLoBits(32).getZExtValue());

      APInt Hi(32, Imm.getHiBits(32).getZExtValue());

      if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {

        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)

          .addImm(SISrcMods::OP_SEL_1)

          .addImm(Lo.getSExtValue())

          .addImm(SISrcMods::OP_SEL_1)

          .addImm(Lo.getSExtValue())

          .addImm(0)  // op_sel_lo

          .addImm(0)  // op_sel_hi

          .addImm(0)  // neg_lo

          .addImm(0)  // neg_hi

          .addImm(0); // clamp

      } else {

        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)

          .addImm(Lo.getSExtValue())

          .addReg(Dst, RegState::Implicit | RegState::Define);

        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)

          .addImm(Hi.getSExtValue())

          .addReg(Dst, RegState::Implicit | RegState::Define);

      }

    } else {

      assert(SrcOp.isReg());

      if (ST.hasPkMovB32() &&

          !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {

        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)

          .addImm(SISrcMods::OP_SEL_1) // src0_mod

          .addReg(SrcOp.getReg())

          .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod

          .addReg(SrcOp.getReg())

          .addImm(0)  // op_sel_lo

          .addImm(0)  // op_sel_hi

          .addImm(0)  // neg_lo

          .addImm(0)  // neg_hi

          .addImm(0); // clamp

      } else {

        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)

          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))

          .addReg(Dst, RegState::Implicit | RegState::Define);

        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)

          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))

          .addReg(Dst, RegState::Implicit | RegState::Define);

      }

    }

    MI.eraseFromParent();

    break;

  }

  case AMDGPU::V_MOV_B64_DPP_PSEUDO: {

    expandMovDPP64(MI);

    break;

  }

  case AMDGPU::S_MOV_B64_IMM_PSEUDO: {

    const MachineOperand &SrcOp = MI.getOperand(1);

    assert(!SrcOp.isFPImm());


    if (ST.has64BitLiterals()) {

      MI.setDesc(get(AMDGPU::S_MOV_B64));

      break;

    }


    APInt Imm(64, SrcOp.getImm());

    if (Imm.isIntN(32) || isInlineConstant(Imm)) {

      MI.setDesc(get(AMDGPU::S_MOV_B64));

      break;

    }


    Register Dst = MI.getOperand(0).getReg();

    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);

    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);


    APInt Lo(32, Imm.getLoBits(32).getZExtValue());

    APInt Hi(32, Imm.getHiBits(32).getZExtValue());

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)

      .addImm(Lo.getSExtValue())

      .addReg(Dst, RegState::Implicit | RegState::Define);

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)

      .addImm(Hi.getSExtValue())

      .addReg(Dst, RegState::Implicit | RegState::Define);

    MI.eraseFromParent();

    break;

  }

  case AMDGPU::V_SET_INACTIVE_B32: {

    // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.

    Register DstReg = MI.getOperand(0).getReg();

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)

        .add(MI.getOperand(3))

        .add(MI.getOperand(4))

        .add(MI.getOperand(1))

        .add(MI.getOperand(2))

        .add(MI.getOperand(5));

    MI.eraseFromParent();

    break;

  }

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:

  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:

  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {

    const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);


    unsigned Opc;

    if (RI.hasVGPRs(EltRC)) {

      Opc = AMDGPU::V_MOVRELD_B32_e32;

    } else {

      Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64

                                              : AMDGPU::S_MOVRELD_B32;

    }


    const MCInstrDesc &OpDesc = get(Opc);

    Register VecReg = MI.getOperand(0).getReg();

    bool IsUndef = MI.getOperand(1).isUndef();

    unsigned SubReg = MI.getOperand(3).getImm();

    assert(VecReg == MI.getOperand(1).getReg());


    MachineInstrBuilder MIB =

      BuildMI(MBB, MI, DL, OpDesc)

        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)

        .add(MI.getOperand(2))

        .addReg(VecReg, RegState::ImplicitDefine)

        .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));


    const int ImpDefIdx =

        OpDesc.getNumOperands() + OpDesc.implicit_uses().size();

    const int ImpUseIdx = ImpDefIdx + 1;

    MIB->tieOperands(ImpDefIdx, ImpUseIdx);

    MI.eraseFromParent();

    break;

  }

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:

  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {

    assert(ST.useVGPRIndexMode());

    Register VecReg = MI.getOperand(0).getReg();

    bool IsUndef = MI.getOperand(1).isUndef();

    MachineOperand &Idx = MI.getOperand(3);

    Register SubReg = MI.getOperand(4).getImm();


    MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))

                              .add(Idx)

                              .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);

    SetOn->getOperand(3).setIsUndef();


    const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);

    MachineInstrBuilder MIB =

        BuildMI(MBB, MI, DL, OpDesc)

            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)

            .add(MI.getOperand(2))

            .addReg(VecReg, RegState::ImplicitDefine)

            .addReg(VecReg,

                    RegState::Implicit | (IsUndef ? RegState::Undef : 0));


    const int ImpDefIdx =

        OpDesc.getNumOperands() + OpDesc.implicit_uses().size();

    const int ImpUseIdx = ImpDefIdx + 1;

    MIB->tieOperands(ImpDefIdx, ImpUseIdx);


    MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));


    finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));


    MI.eraseFromParent();

    break;

  }

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:

  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {

    assert(ST.useVGPRIndexMode());

    Register Dst = MI.getOperand(0).getReg();

    Register VecReg = MI.getOperand(1).getReg();

    bool IsUndef = MI.getOperand(1).isUndef();

    Register Idx = MI.getOperand(2).getReg();

    Register SubReg = MI.getOperand(3).getImm();


    MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))

                              .addReg(Idx)

                              .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);

    SetOn->getOperand(3).setIsUndef();


    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))

        .addDef(Dst)

        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)

        .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));


    MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));


    finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));


    MI.eraseFromParent();

    break;

  }

  case AMDGPU::SI_PC_ADD_REL_OFFSET: {

    MachineFunction &MF = *MBB.getParent();

    Register Reg = MI.getOperand(0).getReg();

    Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);

    Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    MachineOperand OpLo = MI.getOperand(1);

    MachineOperand OpHi = MI.getOperand(2);


    // Create a bundle so these instructions won't be re-ordered by the

    // post-RA scheduler.

    MIBundleBuilder Bundler(MBB, MI);

    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));


    // What we want here is an offset from the value returned by s_getpc (which

    // is the address of the s_add_u32 instruction) to the global variable, but

    // since the encoding of $symbol starts 4 bytes after the start of the

    // s_add_u32 instruction, we end up with an offset that is 4 bytes too

    // small. This requires us to add 4 to the global variable offset in order

    // to compute the correct address. Similarly for the s_addc_u32 instruction,

    // the encoding of $symbol starts 12 bytes after the start of the s_add_u32

    // instruction.


    int64_t Adjust = 0;

    if (ST.hasGetPCZeroExtension()) {

      // Fix up hardware that does not sign-extend the 48-bit PC value by

      // inserting: s_sext_i32_i16 reghi, reghi

      Bundler.append(

          BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));

      Adjust += 4;

    }


    if (OpLo.isGlobal())

      OpLo.setOffset(OpLo.getOffset() + Adjust + 4);

    Bundler.append(

        BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));


    if (OpHi.isGlobal())

      OpHi.setOffset(OpHi.getOffset() + Adjust + 12);

    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)

                       .addReg(RegHi)

                       .add(OpHi));


    finalizeBundle(MBB, Bundler.begin());


    MI.eraseFromParent();

    break;

  }

  case AMDGPU::SI_PC_ADD_REL_OFFSET64: {

    MachineFunction &MF = *MBB.getParent();

    Register Reg = MI.getOperand(0).getReg();

    MachineOperand Op = MI.getOperand(1);


    // Create a bundle so these instructions won't be re-ordered by the

    // post-RA scheduler.

    MIBundleBuilder Bundler(MBB, MI);

    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    if (Op.isGlobal())

      Op.setOffset(Op.getOffset() + 4);

    Bundler.append(

        BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));


    finalizeBundle(MBB, Bundler.begin());


    MI.eraseFromParent();

    break;

  }

  case AMDGPU::ENTER_STRICT_WWM: {

    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when

    // Whole Wave Mode is entered.

    MI.setDesc(get(LMC.OrSaveExecOpc));

    break;

  }

  case AMDGPU::ENTER_STRICT_WQM: {

    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when

    // STRICT_WQM is entered.

    BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())

        .addReg(LMC.ExecReg);

    BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);


    MI.eraseFromParent();

    break;

  }

  case AMDGPU::EXIT_STRICT_WWM:

  case AMDGPU::EXIT_STRICT_WQM: {

    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when

    // WWM/STICT_WQM is exited.

    MI.setDesc(get(LMC.MovOpc));

    break;

  }

  case AMDGPU::SI_RETURN: {

    const MachineFunction *MF = MBB.getParent();

    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

    const SIRegisterInfo *TRI = ST.getRegisterInfo();

    // Hiding the return address use with SI_RETURN may lead to extra kills in

    // the function and missing live-ins. We are fine in practice because callee

    // saved register handling ensures the register value is restored before

    // RET, but we need the undef flag here to appease the MachineVerifier

    // liveness checks.

    MachineInstrBuilder MIB =

        BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))

            .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);


    MIB.copyImplicitOps(MI);

    MI.eraseFromParent();

    break;

  }


  case AMDGPU::S_MUL_U64_U32_PSEUDO:

  case AMDGPU::S_MUL_I64_I32_PSEUDO:

    MI.setDesc(get(AMDGPU::S_MUL_U64));

    break;


  case AMDGPU::S_GETPC_B64_pseudo:

    MI.setDesc(get(AMDGPU::S_GETPC_B64));

    if (ST.hasGetPCZeroExtension()) {

      Register Dst = MI.getOperand(0).getReg();

      Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

      // Fix up hardware that does not sign-extend the 48-bit PC value by

      // inserting: s_sext_i32_i16 dsthi, dsthi

      BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),

              DstHi)

          .addReg(DstHi);

    }

    break;


  case AMDGPU::V_MAX_BF16_PSEUDO_e64:

    assert(ST.hasBF16PackedInsts());

    MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));

    MI.addOperand(MachineOperand::CreateImm(0)); // op_sel

    MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo

    MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi

    auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);

    Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);

    auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);

    break;

  }


  return true;

}


void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,

                                MachineBasicBlock::iterator I, Register DestReg,

                                unsigned SubIdx, const MachineInstr &Orig,

                                const TargetRegisterInfo &RI) const {


  // Try shrinking the instruction to remat only the part needed for current

  // context.

  // TODO: Handle more cases.

  unsigned Opcode = Orig.getOpcode();

  switch (Opcode) {

  case AMDGPU::S_LOAD_DWORDX16_IMM:

  case AMDGPU::S_LOAD_DWORDX8_IMM: {

    if (SubIdx != 0)

      break;


    if (I == MBB.end())

      break;


    if (I->isBundled())

      break;


    // Look for a single use of the register that is also a subreg.

    Register RegToFind = Orig.getOperand(0).getReg();

    MachineOperand *UseMO = nullptr;

    for (auto &CandMO : I->operands()) {

      if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())

        continue;

      if (UseMO) {

        UseMO = nullptr;

        break;

      }

      UseMO = &CandMO;

    }

    if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)

      break;


    unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());

    unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());


    MachineFunction *MF = MBB.getParent();

    MachineRegisterInfo &MRI = MF->getRegInfo();

    assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");


    unsigned NewOpcode = -1;

    if (SubregSize == 256)

      NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;

    else if (SubregSize == 128)

      NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;

    else

      break;


    const MCInstrDesc &TID = get(NewOpcode);

    const TargetRegisterClass *NewRC =

        RI.getAllocatableClass(getRegClass(TID, 0, &RI));

    MRI.setRegClass(DestReg, NewRC);


    UseMO->setReg(DestReg);

    UseMO->setSubReg(AMDGPU::NoSubRegister);


    // Use a smaller load with the desired size, possibly with updated offset.

    MachineInstr *MI = MF->CloneMachineInstr(&Orig);

    MI->setDesc(TID);

    MI->getOperand(0).setReg(DestReg);

    MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);

    if (Offset) {

      MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);

      int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;

      OffsetMO->setImm(FinalOffset);

    }

    SmallVector<MachineMemOperand *> NewMMOs;

    for (const MachineMemOperand *MemOp : Orig.memoperands())

      NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),

                                                 SubregSize / 8));

    MI->setMemRefs(*MF, NewMMOs);


    MBB.insert(I, MI);

    return;

  }


  default:

    break;

  }


  TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);

}


std::pair<MachineInstr*, MachineInstr*>


SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {

  assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);


  if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&

      AMDGPU::isLegalDPALU_DPPControl(

          ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {

    MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));

    return std::pair(&MI, nullptr);

  }


  MachineBasicBlock &MBB = *MI.getParent();

  DebugLoc DL = MBB.findDebugLoc(MI);

  MachineFunction *MF = MBB.getParent();

  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register Dst = MI.getOperand(0).getReg();

  unsigned Part = 0;

  MachineInstr *Split[2];


  for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {

    auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));

    if (Dst.isPhysical()) {

      MovDPP.addDef(RI.getSubReg(Dst, Sub));

    } else {

      assert(MRI.isSSA());

      auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      MovDPP.addDef(Tmp);

    }


    for (unsigned I = 1; I <= 2; ++I) { // old and src operands.

      const MachineOperand &SrcOp = MI.getOperand(I);

      assert(!SrcOp.isFPImm());

      if (SrcOp.isImm()) {

        APInt Imm(64, SrcOp.getImm());

        Imm.ashrInPlace(Part * 32);

        MovDPP.addImm(Imm.getLoBits(32).getZExtValue());

      } else {

        assert(SrcOp.isReg());

        Register Src = SrcOp.getReg();

        if (Src.isPhysical())

          MovDPP.addReg(RI.getSubReg(Src, Sub));

        else

          MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);

      }

    }


    for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))

      MovDPP.addImm(MO.getImm());


    Split[Part] = MovDPP;

    ++Part;

  }


  if (Dst.isVirtual())

    BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)

      .addReg(Split[0]->getOperand(0).getReg())

      .addImm(AMDGPU::sub0)

      .addReg(Split[1]->getOperand(0).getReg())

      .addImm(AMDGPU::sub1);


  MI.eraseFromParent();

  return std::pair(Split[0], Split[1]);

}


std::optional<DestSourcePair>


SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {

  if (MI.getOpcode() == AMDGPU::WWM_COPY)

    return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};


  return std::nullopt;

}


bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,

                                      AMDGPU::OpName Src0OpName,

                                      MachineOperand &Src1,

                                      AMDGPU::OpName Src1OpName) const {

  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);

  if (!Src0Mods)

    return false;


  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);

  assert(Src1Mods &&

         "All commutable instructions have both src0 and src1 modifiers");


  int Src0ModsVal = Src0Mods->getImm();

  int Src1ModsVal = Src1Mods->getImm();


  Src1Mods->setImm(Src0ModsVal);

  Src0Mods->setImm(Src1ModsVal);

  return true;

}


static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,

                                             MachineOperand &RegOp,

                                             MachineOperand &NonRegOp) {

  Register Reg = RegOp.getReg();

  unsigned SubReg = RegOp.getSubReg();

  bool IsKill = RegOp.isKill();

  bool IsDead = RegOp.isDead();

  bool IsUndef = RegOp.isUndef();

  bool IsDebug = RegOp.isDebug();


  if (NonRegOp.isImm())

    RegOp.ChangeToImmediate(NonRegOp.getImm());

  else if (NonRegOp.isFI())

    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());

  else if (NonRegOp.isGlobal()) {

    RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),

                     NonRegOp.getTargetFlags());

  } else

    return nullptr;


  // Make sure we don't reinterpret a subreg index in the target flags.

  RegOp.setTargetFlags(NonRegOp.getTargetFlags());


  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);

  NonRegOp.setSubReg(SubReg);


  return &MI;

}


static MachineInstr *swapImmOperands(MachineInstr &MI,

                                     MachineOperand &NonRegOp1,

                                     MachineOperand &NonRegOp2) {

  unsigned TargetFlags = NonRegOp1.getTargetFlags();

  int64_t NonRegVal = NonRegOp1.getImm();


  NonRegOp1.setImm(NonRegOp2.getImm());

  NonRegOp2.setImm(NonRegVal);

  NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());

  NonRegOp2.setTargetFlags(TargetFlags);

  return &MI;

}


bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,

                                unsigned OpIdx1) const {

  const MCInstrDesc &InstDesc = MI.getDesc();

  const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];

  const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];


  unsigned Opc = MI.getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);


  const MachineOperand &MO0 = MI.getOperand(OpIdx0);

  const MachineOperand &MO1 = MI.getOperand(OpIdx1);


  // Swap doesn't breach constant bus or literal limits

  // It may move literal to position other than src0, this is not allowed

  // pre-gfx10 However, most test cases need literals in Src0 for VOP

  // FIXME: After gfx9, literal can be in place other than Src0

  if (isVALU(MI)) {

    if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&

        !isInlineConstant(MO0, OpInfo1))

      return false;

    if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&

        !isInlineConstant(MO1, OpInfo0))

      return false;

  }


  if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {

    if (OpInfo1.RegClass == -1)

      return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;

    return isLegalRegOperand(MI, OpIdx1, MO0) &&

           (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));

  }

  if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {

    if (OpInfo0.RegClass == -1)

      return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;

    return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&

           isLegalRegOperand(MI, OpIdx0, MO1);

  }


  // No need to check 64-bit literals since swapping does not bring new

  // 64-bit literals into current instruction to fold to 32-bit


  return isImmOperandLegal(MI, OpIdx1, MO0);

}


MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,

                                                  unsigned Src0Idx,

                                                  unsigned Src1Idx) const {

  assert(!NewMI && "this should never be used");


  unsigned Opc = MI.getOpcode();

  int CommutedOpcode = commuteOpcode(Opc);

  if (CommutedOpcode == -1)

    return nullptr;


  if (Src0Idx > Src1Idx)

    std::swap(Src0Idx, Src1Idx);


  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==

           static_cast<int>(Src0Idx) &&

         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==

           static_cast<int>(Src1Idx) &&

         "inconsistency with findCommutedOpIndices");


  if (!isLegalToSwap(MI, Src0Idx, Src1Idx))

    return nullptr;


  MachineInstr *CommutedMI = nullptr;

  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  if (Src0.isReg() && Src1.isReg()) {

    // Be sure to copy the source modifiers to the right place.

    CommutedMI =

        TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);

  } else if (Src0.isReg() && !Src1.isReg()) {

    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);

  } else if (!Src0.isReg() && Src1.isReg()) {

    CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);

  } else if (Src0.isImm() && Src1.isImm()) {

    CommutedMI = swapImmOperands(MI, Src0, Src1);

  } else {

    // FIXME: Found two non registers to commute. This does happen.

    return nullptr;

  }


  if (CommutedMI) {

    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,

                        Src1, AMDGPU::OpName::src1_modifiers);


    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,

                        AMDGPU::OpName::src1_sel);


    CommutedMI->setDesc(get(CommutedOpcode));

  }


  return CommutedMI;

}


// This needs to be implemented because the source modifiers may be inserted

// between the true commutable operands, and the base

// TargetInstrInfo::commuteInstruction uses it.


bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,

                                        unsigned &SrcOpIdx0,

                                        unsigned &SrcOpIdx1) const {

  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);

}


bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,

                                        unsigned &SrcOpIdx0,

                                        unsigned &SrcOpIdx1) const {

  if (!Desc.isCommutable())

    return false;


  unsigned Opc = Desc.getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);

  if (Src0Idx == -1)

    return false;


  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);

  if (Src1Idx == -1)

    return false;


  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);

}


bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,

                                        int64_t BrOffset) const {

  // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64

  // because its dest block is unanalyzable.

  assert(isSOPP(BranchOp) || isSOPK(BranchOp));


  // Convert to dwords.

  BrOffset /= 4;


  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is

  // from the next instruction.

  BrOffset -= 1;


  return isIntN(BranchOffsetBits, BrOffset);

}


MachineBasicBlock *


SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {

  return MI.getOperand(0).getMBB();

}


bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {

  for (const MachineInstr &MI : MBB->terminators()) {

    if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||

        MI.getOpcode() == AMDGPU::SI_LOOP)

      return true;

  }

  return false;

}


void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,

                                       MachineBasicBlock &DestBB,

                                       MachineBasicBlock &RestoreBB,

                                       const DebugLoc &DL, int64_t BrOffset,

                                       RegScavenger *RS) const {

  assert(MBB.empty() &&

         "new block should be inserted for expanding unconditional branch");

  assert(MBB.pred_size() == 1);

  assert(RestoreBB.empty() &&

         "restore block should be inserted for restoring clobbered registers");


  MachineFunction *MF = MBB.getParent();

  MachineRegisterInfo &MRI = MF->getRegInfo();

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  auto I = MBB.end();

  auto &MCCtx = MF->getContext();


  if (ST.hasAddPC64Inst()) {

    MCSymbol *Offset =

        MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);

    auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))

                     .addSym(Offset, MO_FAR_BRANCH_OFFSET);

    MCSymbol *PostAddPCLabel =

        MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);

    AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);

    auto *OffsetExpr = MCBinaryExpr::createSub(

        MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),

        MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);

    Offset->setVariableValue(OffsetExpr);

    return;

  }


  assert(RS && "RegScavenger required for long branching");


  // FIXME: Virtual register workaround for RegScavenger not working with empty

  // blocks.

  Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);


  // Note: as this is used after hazard recognizer we need to apply some hazard

  // workarounds directly.

  const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||

                               ST.hasVALUReadSGPRHazard();

  auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {

    if (FlushSGPRWrites)

      BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))

          .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));

  };


  // We need to compute the offset relative to the instruction immediately after

  // s_getpc_b64. Insert pc arithmetic code before last terminator.

  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);

  ApplyHazardWorkarounds();


  MCSymbol *PostGetPCLabel =

      MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);

  GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);


  MCSymbol *OffsetLo =

      MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);

  MCSymbol *OffsetHi =

      MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);

  BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))

      .addReg(PCReg, RegState::Define, AMDGPU::sub0)

      .addReg(PCReg, 0, AMDGPU::sub0)

      .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);

  BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))

      .addReg(PCReg, RegState::Define, AMDGPU::sub1)

      .addReg(PCReg, 0, AMDGPU::sub1)

      .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);

  ApplyHazardWorkarounds();


  // Insert the indirect branch after the other terminator.

  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))

    .addReg(PCReg);


  // If a spill is needed for the pc register pair, we need to insert a spill

  // restore block right before the destination block, and insert a short branch

  // into the old destination block's fallthrough predecessor.

  // e.g.:

  //

  // s_cbranch_scc0 skip_long_branch:

  //

  // long_branch_bb:

  //   spill s[8:9]

  //   s_getpc_b64 s[8:9]

  //   s_add_u32 s8, s8, restore_bb

  //   s_addc_u32 s9, s9, 0

  //   s_setpc_b64 s[8:9]

  //

  // skip_long_branch:

  //   foo;

  //

  // .....

  //

  // dest_bb_fallthrough_predecessor:

  // bar;

  // s_branch dest_bb

  //

  // restore_bb:

  //  restore s[8:9]

  //  fallthrough dest_bb

  ///

  // dest_bb:

  //   buzz;


  Register LongBranchReservedReg = MFI->getLongBranchReservedReg();

  Register Scav;


  // If we've previously reserved a register for long branches

  // avoid running the scavenger and just use those registers

  if (LongBranchReservedReg) {

    RS->enterBasicBlock(MBB);

    Scav = LongBranchReservedReg;

  } else {

    RS->enterBasicBlockEnd(MBB);

    Scav = RS->scavengeRegisterBackwards(

        AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),

        /* RestoreAfter */ false, 0, /* AllowSpill */ false);

  }

  if (Scav) {

    RS->setRegUsed(Scav);

    MRI.replaceRegWith(PCReg, Scav);

    MRI.clearVirtRegs();

  } else {

    // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for

    // SGPR spill.

    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

    const SIRegisterInfo *TRI = ST.getRegisterInfo();

    TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);

    MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);

    MRI.clearVirtRegs();

  }


  MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();

  // Now, the distance could be defined.

  auto *Offset = MCBinaryExpr::createSub(

      MCSymbolRefExpr::create(DestLabel, MCCtx),

      MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);

  // Add offset assignments.

  auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);

  OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));

  auto *ShAmt = MCConstantExpr::create(32, MCCtx);

  OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));

}


unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {

  switch (Cond) {

  case SIInstrInfo::SCC_TRUE:

    return AMDGPU::S_CBRANCH_SCC1;

  case SIInstrInfo::SCC_FALSE:

    return AMDGPU::S_CBRANCH_SCC0;

  case SIInstrInfo::VCCNZ:

    return AMDGPU::S_CBRANCH_VCCNZ;

  case SIInstrInfo::VCCZ:

    return AMDGPU::S_CBRANCH_VCCZ;

  case SIInstrInfo::EXECNZ:

    return AMDGPU::S_CBRANCH_EXECNZ;

  case SIInstrInfo::EXECZ:

    return AMDGPU::S_CBRANCH_EXECZ;

  default:

    llvm_unreachable("invalid branch predicate");

  }

}


SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {

  switch (Opcode) {

  case AMDGPU::S_CBRANCH_SCC0:

    return SCC_FALSE;

  case AMDGPU::S_CBRANCH_SCC1:

    return SCC_TRUE;

  case AMDGPU::S_CBRANCH_VCCNZ:

    return VCCNZ;

  case AMDGPU::S_CBRANCH_VCCZ:

    return VCCZ;

  case AMDGPU::S_CBRANCH_EXECNZ:

    return EXECNZ;

  case AMDGPU::S_CBRANCH_EXECZ:

    return EXECZ;

  default:

    return INVALID_BR;

  }

}


bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,

                                    MachineBasicBlock::iterator I,

                                    MachineBasicBlock *&TBB,

                                    MachineBasicBlock *&FBB,

                                    SmallVectorImpl<MachineOperand> &Cond,

                                    bool AllowModify) const {

  if (I->getOpcode() == AMDGPU::S_BRANCH) {

    // Unconditional Branch

    TBB = I->getOperand(0).getMBB();

    return false;

  }


  BranchPredicate Pred = getBranchPredicate(I->getOpcode());

  if (Pred == INVALID_BR)

    return true;


  MachineBasicBlock *CondBB = I->getOperand(0).getMBB();

  Cond.push_back(MachineOperand::CreateImm(Pred));

  Cond.push_back(I->getOperand(1)); // Save the branch register.


  ++I;


  if (I == MBB.end()) {

    // Conditional branch followed by fall-through.

    TBB = CondBB;

    return false;

  }


  if (I->getOpcode() == AMDGPU::S_BRANCH) {

    TBB = CondBB;

    FBB = I->getOperand(0).getMBB();

    return false;

  }


  return true;

}


bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,

                                MachineBasicBlock *&FBB,

                                SmallVectorImpl<MachineOperand> &Cond,

                                bool AllowModify) const {

  MachineBasicBlock::iterator I = MBB.getFirstTerminator();

  auto E = MBB.end();

  if (I == E)

    return false;


  // Skip over the instructions that are artificially terminators for special

  // exec management.

  while (I != E && !I->isBranch() && !I->isReturn()) {

    switch (I->getOpcode()) {

    case AMDGPU::S_MOV_B64_term:

    case AMDGPU::S_XOR_B64_term:

    case AMDGPU::S_OR_B64_term:

    case AMDGPU::S_ANDN2_B64_term:

    case AMDGPU::S_AND_B64_term:

    case AMDGPU::S_AND_SAVEEXEC_B64_term:

    case AMDGPU::S_MOV_B32_term:

    case AMDGPU::S_XOR_B32_term:

    case AMDGPU::S_OR_B32_term:

    case AMDGPU::S_ANDN2_B32_term:

    case AMDGPU::S_AND_B32_term:

    case AMDGPU::S_AND_SAVEEXEC_B32_term:

      break;

    case AMDGPU::SI_IF:

    case AMDGPU::SI_ELSE:

    case AMDGPU::SI_KILL_I1_TERMINATOR:

    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:

      // FIXME: It's messy that these need to be considered here at all.

      return true;

    default:

      llvm_unreachable("unexpected non-branch terminator inst");

    }


    ++I;

  }


  if (I == E)

    return false;


  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

}


unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,

                                   int *BytesRemoved) const {

  unsigned Count = 0;

  unsigned RemovedSize = 0;

  for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {

    // Skip over artificial terminators when removing instructions.

    if (MI.isBranch() || MI.isReturn()) {

      RemovedSize += getInstSizeInBytes(MI);

      MI.eraseFromParent();

      ++Count;

    }

  }


  if (BytesRemoved)

    *BytesRemoved = RemovedSize;


  return Count;

}


// Copy the flags onto the implicit condition register operand.


static void preserveCondRegFlags(MachineOperand &CondReg,

                                 const MachineOperand &OrigCond) {

  CondReg.setIsUndef(OrigCond.isUndef());

  CondReg.setIsKill(OrigCond.isKill());

}


unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,

                                   MachineBasicBlock *TBB,

                                   MachineBasicBlock *FBB,

                                   ArrayRef<MachineOperand> Cond,

                                   const DebugLoc &DL,

                                   int *BytesAdded) const {

  if (!FBB && Cond.empty()) {

    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))

      .addMBB(TBB);

    if (BytesAdded)

      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;

    return 1;

  }


  assert(TBB && Cond[0].isImm());


  unsigned Opcode

    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));


  if (!FBB) {

    MachineInstr *CondBr =

      BuildMI(&MBB, DL, get(Opcode))

      .addMBB(TBB);


    // Copy the flags onto the implicit condition register operand.

    preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);

    fixImplicitOperands(*CondBr);


    if (BytesAdded)

      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;

    return 1;

  }


  assert(TBB && FBB);


  MachineInstr *CondBr =

    BuildMI(&MBB, DL, get(Opcode))

    .addMBB(TBB);

  fixImplicitOperands(*CondBr);

  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))

    .addMBB(FBB);


  MachineOperand &CondReg = CondBr->getOperand(1);

  CondReg.setIsUndef(Cond[1].isUndef());

  CondReg.setIsKill(Cond[1].isKill());


  if (BytesAdded)

    *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;


  return 2;

}


bool SIInstrInfo::reverseBranchCondition(

  SmallVectorImpl<MachineOperand> &Cond) const {

  if (Cond.size() != 2) {

    return true;

  }


  if (Cond[0].isImm()) {

    Cond[0].setImm(-Cond[0].getImm());

    return false;

  }


  return true;

}


bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,

                                  ArrayRef<MachineOperand> Cond,

                                  Register DstReg, Register TrueReg,

                                  Register FalseReg, int &CondCycles,

                                  int &TrueCycles, int &FalseCycles) const {

  switch (Cond[0].getImm()) {

  case VCCNZ:

  case VCCZ: {

    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);

    if (MRI.getRegClass(FalseReg) != RC)

      return false;


    int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;

    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???


    // Limit to equal cost for branch vs. N v_cndmask_b32s.

    return RI.hasVGPRs(RC) && NumInsts <= 6;

  }

  case SCC_TRUE:

  case SCC_FALSE: {

    // FIXME: We could insert for VGPRs if we could replace the original compare

    // with a vector one.

    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);

    if (MRI.getRegClass(FalseReg) != RC)

      return false;


    int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;


    // Multiples of 8 can do s_cselect_b64

    if (NumInsts % 2 == 0)

      NumInsts /= 2;


    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???

    return RI.isSGPRClass(RC);

  }

  default:

    return false;

  }

}


void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,

                               MachineBasicBlock::iterator I, const DebugLoc &DL,

                               Register DstReg, ArrayRef<MachineOperand> Cond,

                               Register TrueReg, Register FalseReg) const {

  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());

  if (Pred == VCCZ || Pred == SCC_FALSE) {

    Pred = static_cast<BranchPredicate>(-Pred);

    std::swap(TrueReg, FalseReg);

  }


  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);

  unsigned DstSize = RI.getRegSizeInBits(*DstRC);


  if (DstSize == 32) {

    MachineInstr *Select;

    if (Pred == SCC_TRUE) {

      Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)

        .addReg(TrueReg)

        .addReg(FalseReg);

    } else {

      // Instruction's operands are backwards from what is expected.

      Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)

        .addReg(FalseReg)

        .addReg(TrueReg);

    }


    preserveCondRegFlags(Select->getOperand(3), Cond[1]);

    return;

  }


  if (DstSize == 64 && Pred == SCC_TRUE) {

    MachineInstr *Select =

      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)

      .addReg(TrueReg)

      .addReg(FalseReg);


    preserveCondRegFlags(Select->getOperand(3), Cond[1]);

    return;

  }


  static const int16_t Sub0_15[] = {

    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,

    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,

    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,

    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,

  };


  static const int16_t Sub0_15_64[] = {

    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,

    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,

    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,

    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,

  };


  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;

  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;

  const int16_t *SubIndices = Sub0_15;

  int NElts = DstSize / 32;


  // 64-bit select is only available for SALU.

  // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.

  if (Pred == SCC_TRUE) {

    if (NElts % 2) {

      SelOp = AMDGPU::S_CSELECT_B32;

      EltRC = &AMDGPU::SGPR_32RegClass;

    } else {

      SelOp = AMDGPU::S_CSELECT_B64;

      EltRC = &AMDGPU::SGPR_64RegClass;

      SubIndices = Sub0_15_64;

      NElts /= 2;

    }

  }


  MachineInstrBuilder MIB = BuildMI(

    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);


  I = MIB->getIterator();


  SmallVector<Register, 8> Regs;

  for (int Idx = 0; Idx != NElts; ++Idx) {

    Register DstElt = MRI.createVirtualRegister(EltRC);

    Regs.push_back(DstElt);


    unsigned SubIdx = SubIndices[Idx];


    MachineInstr *Select;

    if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {

      Select =

        BuildMI(MBB, I, DL, get(SelOp), DstElt)

        .addReg(FalseReg, 0, SubIdx)

        .addReg(TrueReg, 0, SubIdx);

    } else {

      Select =

        BuildMI(MBB, I, DL, get(SelOp), DstElt)

        .addReg(TrueReg, 0, SubIdx)

        .addReg(FalseReg, 0, SubIdx);

    }


    preserveCondRegFlags(Select->getOperand(3), Cond[1]);

    fixImplicitOperands(*Select);


    MIB.addReg(DstElt)

       .addImm(SubIdx);

  }

}


bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {

  switch (MI.getOpcode()) {

  case AMDGPU::V_MOV_B16_t16_e32:

  case AMDGPU::V_MOV_B16_t16_e64:

  case AMDGPU::V_MOV_B32_e32:

  case AMDGPU::V_MOV_B32_e64:

  case AMDGPU::V_MOV_B64_PSEUDO:

  case AMDGPU::V_MOV_B64_e32:

  case AMDGPU::V_MOV_B64_e64:

  case AMDGPU::S_MOV_B32:

  case AMDGPU::S_MOV_B64:

  case AMDGPU::S_MOV_B64_IMM_PSEUDO:

  case AMDGPU::COPY:

  case AMDGPU::WWM_COPY:

  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:

  case AMDGPU::V_ACCVGPR_READ_B32_e64:

  case AMDGPU::V_ACCVGPR_MOV_B32:

  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:

  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:

    return true;

  default:

    return false;

  }

}


static constexpr AMDGPU::OpName ModifierOpNames[] = {

    AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,

    AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,

    AMDGPU::OpName::omod,           AMDGPU::OpName::op_sel};


void SIInstrInfo::removeModOperands(MachineInstr &MI) const {

  unsigned Opc = MI.getOpcode();

  for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {

    int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);

    if (Idx >= 0)

      MI.removeOperand(Idx);

  }

}


std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,

                                                         unsigned SubRegIndex) {

  switch (SubRegIndex) {

  case AMDGPU::NoSubRegister:

    return Imm;

  case AMDGPU::sub0:

    return SignExtend64<32>(Imm);

  case AMDGPU::sub1:

    return SignExtend64<32>(Imm >> 32);

  case AMDGPU::lo16:

    return SignExtend64<16>(Imm);

  case AMDGPU::hi16:

    return SignExtend64<16>(Imm >> 16);

  case AMDGPU::sub1_lo16:

    return SignExtend64<16>(Imm >> 32);

  case AMDGPU::sub1_hi16:

    return SignExtend64<16>(Imm >> 48);

  default:

    return std::nullopt;

  }


  llvm_unreachable("covered subregister switch");

}


static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {

  switch (Opc) {

  case AMDGPU::V_MAC_F16_e32:

  case AMDGPU::V_MAC_F16_e64:

  case AMDGPU::V_MAD_F16_e64:

    return AMDGPU::V_MADAK_F16;

  case AMDGPU::V_MAC_F32_e32:

  case AMDGPU::V_MAC_F32_e64:

  case AMDGPU::V_MAD_F32_e64:

    return AMDGPU::V_MADAK_F32;

  case AMDGPU::V_FMAC_F32_e32:

  case AMDGPU::V_FMAC_F32_e64:

  case AMDGPU::V_FMA_F32_e64:

    return AMDGPU::V_FMAAK_F32;

  case AMDGPU::V_FMAC_F16_e32:

  case AMDGPU::V_FMAC_F16_e64:

  case AMDGPU::V_FMAC_F16_t16_e64:

  case AMDGPU::V_FMAC_F16_fake16_e64:

  case AMDGPU::V_FMA_F16_e64:

    return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()

                                        ? AMDGPU::V_FMAAK_F16_t16

                                        : AMDGPU::V_FMAAK_F16_fake16

                                  : AMDGPU::V_FMAAK_F16;

  case AMDGPU::V_FMAC_F64_e32:

  case AMDGPU::V_FMAC_F64_e64:

  case AMDGPU::V_FMA_F64_e64:

    return AMDGPU::V_FMAAK_F64;

  default:

    llvm_unreachable("invalid instruction");

  }

}


static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {

  switch (Opc) {

  case AMDGPU::V_MAC_F16_e32:

  case AMDGPU::V_MAC_F16_e64:

  case AMDGPU::V_MAD_F16_e64:

    return AMDGPU::V_MADMK_F16;

  case AMDGPU::V_MAC_F32_e32:

  case AMDGPU::V_MAC_F32_e64:

  case AMDGPU::V_MAD_F32_e64:

    return AMDGPU::V_MADMK_F32;

  case AMDGPU::V_FMAC_F32_e32:

  case AMDGPU::V_FMAC_F32_e64:

  case AMDGPU::V_FMA_F32_e64:

    return AMDGPU::V_FMAMK_F32;

  case AMDGPU::V_FMAC_F16_e32:

  case AMDGPU::V_FMAC_F16_e64:

  case AMDGPU::V_FMAC_F16_t16_e64:

  case AMDGPU::V_FMAC_F16_fake16_e64:

  case AMDGPU::V_FMA_F16_e64:

    return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()

                                        ? AMDGPU::V_FMAMK_F16_t16

                                        : AMDGPU::V_FMAMK_F16_fake16

                                  : AMDGPU::V_FMAMK_F16;

  case AMDGPU::V_FMAC_F64_e32:

  case AMDGPU::V_FMAC_F64_e64:

  case AMDGPU::V_FMA_F64_e64:

    return AMDGPU::V_FMAMK_F64;

  default:

    llvm_unreachable("invalid instruction");

  }

}


bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,

                                Register Reg, MachineRegisterInfo *MRI) const {

  int64_t Imm;

  if (!getConstValDefinedInReg(DefMI, Reg, Imm))

    return false;


  const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);


  assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");


  unsigned Opc = UseMI.getOpcode();

  if (Opc == AMDGPU::COPY) {

    assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");


    Register DstReg = UseMI.getOperand(0).getReg();

    Register UseSubReg = UseMI.getOperand(1).getSubReg();


    const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);


    if (HasMultipleUses) {

      // TODO: This should fold in more cases with multiple use, but we need to

      // more carefully consider what those uses are.

      unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));


      // Avoid breaking up a 64-bit inline immediate into a subregister extract.

      if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)

        return false;


      // Most of the time folding a 32-bit inline constant is free (though this

      // might not be true if we can't later fold it into a real user).

      //

      // FIXME: This isInlineConstant check is imprecise if

      // getConstValDefinedInReg handled the tricky non-mov cases.

      if (ImmDefSize == 32 &&

          !isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))

        return false;

    }


    bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&

                   RI.getSubRegIdxSize(UseSubReg) == 16;


    if (Is16Bit) {

      if (RI.hasVGPRs(DstRC))

        return false; // Do not clobber vgpr_hi16


      if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)

        return false;

    }


    MachineFunction *MF = UseMI.getMF();


    unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;

    MCRegister MovDstPhysReg =

        DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();


    std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);


    // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64

    for (unsigned MovOp :

         {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,

          AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {

      const MCInstrDesc &MovDesc = get(MovOp);


      const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);

      if (Is16Bit) {

        // We just need to find a correctly sized register class, so the

        // subregister index compatibility doesn't matter since we're statically

        // extracting the immediate value.

        MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);

        if (!MovDstRC)

          continue;


        if (MovDstPhysReg) {

          // FIXME: We probably should not do this. If there is a live value in

          // the high half of the register, it will be corrupted.

          MovDstPhysReg =

              RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);

          if (!MovDstPhysReg)

            continue;

        }

      }


      // Result class isn't the right size, try the next instruction.

      if (MovDstPhysReg) {

        if (!MovDstRC->contains(MovDstPhysReg))

          return false;

      } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {

        // TODO: This will be overly conservative in the case of 16-bit virtual

        // SGPRs. We could hack up the virtual register uses to use a compatible

        // 32-bit class.

        continue;

      }


      const MCOperandInfo &OpInfo = MovDesc.operands()[1];


      // Ensure the interpreted immediate value is a valid operand in the new

      // mov.

      //

      // FIXME: isImmOperandLegal should have form that doesn't require existing

      // MachineInstr or MachineOperand

      if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&

          !isInlineConstant(*SubRegImm, OpInfo.OperandType))

        break;


      NewOpc = MovOp;

      break;

    }


    if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)

      return false;


    if (Is16Bit) {

      UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);

      if (MovDstPhysReg)

        UseMI.getOperand(0).setReg(MovDstPhysReg);

      assert(UseMI.getOperand(1).getReg().isVirtual());

    }


    const MCInstrDesc &NewMCID = get(NewOpc);

    UseMI.setDesc(NewMCID);

    UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);

    UseMI.addImplicitDefUseOperands(*MF);

    return true;

  }


  if (HasMultipleUses)

    return false;


  if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||

      Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||

      Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||

      Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||

      Opc == AMDGPU::V_FMAC_F16_t16_e64 ||

      Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||

      Opc == AMDGPU::V_FMAC_F64_e64) {

    // Don't fold if we are using source or output modifiers. The new VOP2

    // instructions don't have them.

    if (hasAnyModifiersSet(UseMI))

      return false;


    // If this is a free constant, there's no reason to do this.

    // TODO: We could fold this here instead of letting SIFoldOperands do it

    // later.

    int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);


    // Any src operand can be used for the legality check.

    if (isInlineConstant(UseMI, Src0Idx, Imm))

      return false;


    MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);


    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);

    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);


    // Multiplied part is the constant: Use v_madmk_{f16, f32}.

    if ((Src0->isReg() && Src0->getReg() == Reg) ||

        (Src1->isReg() && Src1->getReg() == Reg)) {

      MachineOperand *RegSrc =

          Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;

      if (!RegSrc->isReg())

        return false;

      if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&

          ST.getConstantBusLimit(Opc) < 2)

        return false;


      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))

        return false;


      // If src2 is also a literal constant then we have to choose which one to

      // fold. In general it is better to choose madak so that the other literal

      // can be materialized in an sgpr instead of a vgpr:

      //   s_mov_b32 s0, literal

      //   v_madak_f32 v0, s0, v0, literal

      // Instead of:

      //   v_mov_b32 v1, literal

      //   v_madmk_f32 v0, v0, literal, v1

      MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());

      if (Def && Def->isMoveImmediate() &&

          !isInlineConstant(Def->getOperand(1)))

        return false;


      unsigned NewOpc = getNewFMAMKInst(ST, Opc);

      if (pseudoToMCOpcode(NewOpc) == -1)

        return false;


      // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16

      // takes VGPR_32_Lo128 operands, so the rewrite would also require

      // restricting their register classes. For now just bail out.

      if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||

          NewOpc == AMDGPU::V_FMAMK_F16_fake16)

        return false;


      const std::optional<int64_t> SubRegImm = extractSubregFromImm(

          Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());


      // FIXME: This would be a lot easier if we could return a new instruction

      // instead of having to modify in place.


      Register SrcReg = RegSrc->getReg();

      unsigned SrcSubReg = RegSrc->getSubReg();

      Src0->setReg(SrcReg);

      Src0->setSubReg(SrcSubReg);

      Src0->setIsKill(RegSrc->isKill());


      if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||

          Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||

          Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||

          Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)

        UseMI.untieRegOperand(

            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));


      Src1->ChangeToImmediate(*SubRegImm);


      removeModOperands(UseMI);

      UseMI.setDesc(get(NewOpc));


      bool DeleteDef = MRI->use_nodbg_empty(Reg);

      if (DeleteDef)

        DefMI.eraseFromParent();


      return true;

    }


    // Added part is the constant: Use v_madak_{f16, f32}.

    if (Src2->isReg() && Src2->getReg() == Reg) {

      if (ST.getConstantBusLimit(Opc) < 2) {

        // Not allowed to use constant bus for another operand.

        // We can however allow an inline immediate as src0.

        bool Src0Inlined = false;

        if (Src0->isReg()) {

          // Try to inline constant if possible.

          // If the Def moves immediate and the use is single

          // We are saving VGPR here.

          MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());

          if (Def && Def->isMoveImmediate() &&

              isInlineConstant(Def->getOperand(1)) &&

              MRI->hasOneUse(Src0->getReg())) {

            Src0->ChangeToImmediate(Def->getOperand(1).getImm());

            Src0Inlined = true;

          } else if (ST.getConstantBusLimit(Opc) <= 1 &&

                     RI.isSGPRReg(*MRI, Src0->getReg())) {

            return false;

          }

          // VGPR is okay as Src0 - fallthrough

        }


        if (Src1->isReg() && !Src0Inlined) {

          // We have one slot for inlinable constant so far - try to fill it

          MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());

          if (Def && Def->isMoveImmediate() &&

              isInlineConstant(Def->getOperand(1)) &&

              MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))

            Src0->ChangeToImmediate(Def->getOperand(1).getImm());

          else if (RI.isSGPRReg(*MRI, Src1->getReg()))

            return false;

          // VGPR is okay as Src1 - fallthrough

        }

      }


      unsigned NewOpc = getNewFMAAKInst(ST, Opc);

      if (pseudoToMCOpcode(NewOpc) == -1)

        return false;


      // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16

      // takes VGPR_32_Lo128 operands, so the rewrite would also require

      // restricting their register classes. For now just bail out.

      if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||

          NewOpc == AMDGPU::V_FMAAK_F16_fake16)

        return false;


      // FIXME: This would be a lot easier if we could return a new instruction

      // instead of having to modify in place.


      if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||

          Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||

          Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||

          Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)

        UseMI.untieRegOperand(

            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));


      const std::optional<int64_t> SubRegImm =

          extractSubregFromImm(Imm, Src2->getSubReg());


      // ChangingToImmediate adds Src2 back to the instruction.

      Src2->ChangeToImmediate(*SubRegImm);


      // These come before src2.

      removeModOperands(UseMI);

      UseMI.setDesc(get(NewOpc));

      // It might happen that UseMI was commuted

      // and we now have SGPR as SRC1. If so 2 inlined

      // constant and SGPR are illegal.

      legalizeOperands(UseMI);


      bool DeleteDef = MRI->use_nodbg_empty(Reg);

      if (DeleteDef)

        DefMI.eraseFromParent();


      return true;

    }

  }


  return false;

}


static bool


memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,

                           ArrayRef<const MachineOperand *> BaseOps2) {

  if (BaseOps1.size() != BaseOps2.size())

    return false;

  for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {

    if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))

      return false;

  }

  return true;

}


static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,

                                LocationSize WidthB, int OffsetB) {

  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;

  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;

  LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;

  return LowWidth.hasValue() &&

         LowOffset + (int)LowWidth.getValue() <= HighOffset;

}


bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,

                                               const MachineInstr &MIb) const {

  SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;

  int64_t Offset0, Offset1;

  LocationSize Dummy0 = LocationSize::precise(0);

  LocationSize Dummy1 = LocationSize::precise(0);

  bool Offset0IsScalable, Offset1IsScalable;

  if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,

                                     Dummy0, &RI) ||

      !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,

                                     Dummy1, &RI))

    return false;


  if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))

    return false;


  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {

    // FIXME: Handle ds_read2 / ds_write2.

    return false;

  }

  LocationSize Width0 = MIa.memoperands().front()->getSize();

  LocationSize Width1 = MIb.memoperands().front()->getSize();

  return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);

}


bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,

                                                  const MachineInstr &MIb) const {

  assert(MIa.mayLoadOrStore() &&

         "MIa must load from or modify a memory location");

  assert(MIb.mayLoadOrStore() &&

         "MIb must load from or modify a memory location");


  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())

    return false;


  // XXX - Can we relax this between address spaces?

  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())

    return false;


  if (isLDSDMA(MIa) || isLDSDMA(MIb))

    return false;


  // TODO: Should we check the address space from the MachineMemOperand? That

  // would allow us to distinguish objects we know don't alias based on the

  // underlying address space, even if it was lowered to a different one,

  // e.g. private accesses lowered to use MUBUF instructions on a scratch

  // buffer.

  if (isDS(MIa)) {

    if (isDS(MIb))

      return checkInstOffsetsDoNotOverlap(MIa, MIb);


    return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);

  }


  if (isMUBUF(MIa) || isMTBUF(MIa)) {

    if (isMUBUF(MIb) || isMTBUF(MIb))

      return checkInstOffsetsDoNotOverlap(MIa, MIb);


    if (isFLAT(MIb))

      return isFLATScratch(MIb);


    return !isSMRD(MIb);

  }


  if (isSMRD(MIa)) {

    if (isSMRD(MIb))

      return checkInstOffsetsDoNotOverlap(MIa, MIb);


    if (isFLAT(MIb))

      return isFLATScratch(MIb);


    return !isMUBUF(MIb) && !isMTBUF(MIb);

  }


  if (isFLAT(MIa)) {

    if (isFLAT(MIb)) {

      if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||

          (isFLATGlobal(MIa) && isFLATScratch(MIb)))

        return true;


      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    }


    return false;

  }


  return false;

}


static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,

                           int64_t &Imm, MachineInstr **DefMI = nullptr) {

  if (Reg.isPhysical())

    return false;

  auto *Def = MRI.getUniqueVRegDef(Reg);

  if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {

    Imm = Def->getOperand(1).getImm();

    if (DefMI)

      *DefMI = Def;

    return true;

  }

  return false;

}


static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,

                           MachineInstr **DefMI = nullptr) {

  if (!MO->isReg())

    return false;

  const MachineFunction *MF = MO->getParent()->getParent()->getParent();

  const MachineRegisterInfo &MRI = MF->getRegInfo();

  return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);

}


static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,

                                MachineInstr &NewMI) {

  if (LV) {

    unsigned NumOps = MI.getNumOperands();

    for (unsigned I = 1; I < NumOps; ++I) {

      MachineOperand &Op = MI.getOperand(I);

      if (Op.isReg() && Op.isKill())

        LV->replaceKillInstruction(Op.getReg(), MI, NewMI);

    }

  }

}


static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {

  switch (Opc) {

  case AMDGPU::V_MAC_F16_e32:

  case AMDGPU::V_MAC_F16_e64:

    return AMDGPU::V_MAD_F16_e64;

  case AMDGPU::V_MAC_F32_e32:

  case AMDGPU::V_MAC_F32_e64:

    return AMDGPU::V_MAD_F32_e64;

  case AMDGPU::V_MAC_LEGACY_F32_e32:

  case AMDGPU::V_MAC_LEGACY_F32_e64:

    return AMDGPU::V_MAD_LEGACY_F32_e64;

  case AMDGPU::V_FMAC_LEGACY_F32_e32:

  case AMDGPU::V_FMAC_LEGACY_F32_e64:

    return AMDGPU::V_FMA_LEGACY_F32_e64;

  case AMDGPU::V_FMAC_F16_e32:

  case AMDGPU::V_FMAC_F16_e64:

  case AMDGPU::V_FMAC_F16_t16_e64:

  case AMDGPU::V_FMAC_F16_fake16_e64:

    return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()

                                        ? AMDGPU::V_FMA_F16_gfx9_t16_e64

                                        : AMDGPU::V_FMA_F16_gfx9_fake16_e64

                                  : AMDGPU::V_FMA_F16_gfx9_e64;

  case AMDGPU::V_FMAC_F32_e32:

  case AMDGPU::V_FMAC_F32_e64:

    return AMDGPU::V_FMA_F32_e64;

  case AMDGPU::V_FMAC_F64_e32:

  case AMDGPU::V_FMAC_F64_e64:

    return AMDGPU::V_FMA_F64_e64;

  default:

    llvm_unreachable("invalid instruction");

  }

}


MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,

                                                 LiveVariables *LV,

                                                 LiveIntervals *LIS) const {

  MachineBasicBlock &MBB = *MI.getParent();

  unsigned Opc = MI.getOpcode();


  // Handle MFMA.

  int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);

  if (NewMFMAOpc != -1) {

    MachineInstrBuilder MIB =

        BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)

      MIB.add(MI.getOperand(I));

    updateLiveVariables(LV, MI, *MIB);

    if (LIS) {

      LIS->ReplaceMachineInstrInMaps(MI, *MIB);

      // SlotIndex of defs needs to be updated when converting to early-clobber

      MachineOperand &Def = MIB->getOperand(0);

      if (Def.isEarlyClobber() && Def.isReg() &&

          LIS->hasInterval(Def.getReg())) {

        SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);

        SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);

        auto &LI = LIS->getInterval(Def.getReg());

        auto UpdateDefIndex = [&](LiveRange &LR) {

          auto *S = LR.find(OldIndex);

          if (S != LR.end() && S->start == OldIndex) {

            assert(S->valno && S->valno->def == OldIndex);

            S->start = NewIndex;

            S->valno->def = NewIndex;

          }

        };

        UpdateDefIndex(LI);

        for (auto &SR : LI.subranges())

          UpdateDefIndex(SR);

      }

    }

    return MIB;

  }


  if (SIInstrInfo::isWMMA(MI)) {

    unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());

    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))

                                  .setMIFlags(MI.getFlags());

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)

      MIB->addOperand(MI.getOperand(I));


    updateLiveVariables(LV, MI, *MIB);

    if (LIS)

      LIS->ReplaceMachineInstrInMaps(MI, *MIB);


    return MIB;

  }


  assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&

         Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&

         "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "

         "present pre-RA");


  // Handle MAC/FMAC.

  bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;

  bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||

                  Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||

                  Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||

                  Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;

  bool Src0Literal = false;


  switch (Opc) {

  default:

    return nullptr;

  case AMDGPU::V_MAC_F16_e64:

  case AMDGPU::V_FMAC_F16_e64:

  case AMDGPU::V_FMAC_F16_t16_e64:

  case AMDGPU::V_FMAC_F16_fake16_e64:

  case AMDGPU::V_MAC_F32_e64:

  case AMDGPU::V_MAC_LEGACY_F32_e64:

  case AMDGPU::V_FMAC_F32_e64:

  case AMDGPU::V_FMAC_LEGACY_F32_e64:

  case AMDGPU::V_FMAC_F64_e64:

    break;

  case AMDGPU::V_MAC_F16_e32:

  case AMDGPU::V_FMAC_F16_e32:

  case AMDGPU::V_MAC_F32_e32:

  case AMDGPU::V_MAC_LEGACY_F32_e32:

  case AMDGPU::V_FMAC_F32_e32:

  case AMDGPU::V_FMAC_LEGACY_F32_e32:

  case AMDGPU::V_FMAC_F64_e32: {

    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),

                                             AMDGPU::OpName::src0);

    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);

    if (!Src0->isReg() && !Src0->isImm())

      return nullptr;


    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))

      Src0Literal = true;


    break;

  }

  }


  MachineInstrBuilder MIB;

  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);

  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);

  const MachineOperand *Src0Mods =

    getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);

  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);

  const MachineOperand *Src1Mods =

    getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);

  const MachineOperand *Src2Mods =

      getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);

  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);

  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);

  const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);


  if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&

      (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&

      // If we have an SGPR input, we will violate the constant bus restriction.

      (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||

       !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {

    MachineInstr *DefMI;

    const auto killDef = [&]() -> void {

      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

      // The only user is the instruction which will be killed.

      Register DefReg = DefMI->getOperand(0).getReg();


      if (MRI.hasOneNonDBGUse(DefReg)) {

        // We cannot just remove the DefMI here, calling pass will crash.

        DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));

        DefMI->getOperand(0).setIsDead(true);

        for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)

          DefMI->removeOperand(I);

        if (LV)

          LV->getVarInfo(DefReg).AliveBlocks.clear();

      }


      if (LIS) {

        LiveInterval &DefLI = LIS->getInterval(DefReg);


        // We cannot delete the original instruction here, so hack out the use

        // in the original instruction with a dummy register so we can use

        // shrinkToUses to deal with any multi-use edge cases. Other targets do

        // not have the complexity of deleting a use to consider here.

        Register DummyReg = MRI.cloneVirtualRegister(DefReg);

        for (MachineOperand &MIOp : MI.uses()) {

          if (MIOp.isReg() && MIOp.getReg() == DefReg) {

            MIOp.setIsUndef(true);

            MIOp.setReg(DummyReg);

          }

        }


        LIS->shrinkToUses(&DefLI);

      }

    };


    int64_t Imm;

    if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {

      unsigned NewOpc = getNewFMAAKInst(ST, Opc);

      if (pseudoToMCOpcode(NewOpc) != -1) {

        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))

                  .add(*Dst)

                  .add(*Src0)

                  .add(*Src1)

                  .addImm(Imm)

                  .setMIFlags(MI.getFlags());

        updateLiveVariables(LV, MI, *MIB);

        if (LIS)

          LIS->ReplaceMachineInstrInMaps(MI, *MIB);

        killDef();

        return MIB;

      }

    }

    unsigned NewOpc = getNewFMAMKInst(ST, Opc);

    if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {

      if (pseudoToMCOpcode(NewOpc) != -1) {

        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))

                  .add(*Dst)

                  .add(*Src0)

                  .addImm(Imm)

                  .add(*Src2)

                  .setMIFlags(MI.getFlags());

        updateLiveVariables(LV, MI, *MIB);


        if (LIS)

          LIS->ReplaceMachineInstrInMaps(MI, *MIB);

        killDef();

        return MIB;

      }

    }

    if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {

      if (Src0Literal) {

        Imm = Src0->getImm();

        DefMI = nullptr;

      }

      if (pseudoToMCOpcode(NewOpc) != -1 &&

          isOperandLegal(

              MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),

              Src1)) {

        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))

                  .add(*Dst)

                  .add(*Src1)

                  .addImm(Imm)

                  .add(*Src2)

                  .setMIFlags(MI.getFlags());

        updateLiveVariables(LV, MI, *MIB);


        if (LIS)

          LIS->ReplaceMachineInstrInMaps(MI, *MIB);

        if (DefMI)

          killDef();

        return MIB;

      }

    }

  }


  // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma

  // if VOP3 does not allow a literal operand.

  if (Src0Literal && !ST.hasVOP3Literal())

    return nullptr;


  unsigned NewOpc = getNewFMAInst(ST, Opc);


  if (pseudoToMCOpcode(NewOpc) == -1)

    return nullptr;


  MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))

            .add(*Dst)

            .addImm(Src0Mods ? Src0Mods->getImm() : 0)

            .add(*Src0)

            .addImm(Src1Mods ? Src1Mods->getImm() : 0)

            .add(*Src1)

            .addImm(Src2Mods ? Src2Mods->getImm() : 0)

            .add(*Src2)

            .addImm(Clamp ? Clamp->getImm() : 0)

            .addImm(Omod ? Omod->getImm() : 0)

            .setMIFlags(MI.getFlags());

  if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))

    MIB.addImm(OpSel ? OpSel->getImm() : 0);

  updateLiveVariables(LV, MI, *MIB);

  if (LIS)

    LIS->ReplaceMachineInstrInMaps(MI, *MIB);

  return MIB;

}


// It's not generally safe to move VALU instructions across these since it will

// start using the register as a base index rather than directly.

// XXX - Why isn't hasSideEffects sufficient for these?


static bool changesVGPRIndexingMode(const MachineInstr &MI) {

  switch (MI.getOpcode()) {

  case AMDGPU::S_SET_GPR_IDX_ON:

  case AMDGPU::S_SET_GPR_IDX_MODE:

  case AMDGPU::S_SET_GPR_IDX_OFF:

    return true;

  default:

    return false;

  }

}


bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,

                                       const MachineBasicBlock *MBB,

                                       const MachineFunction &MF) const {

  // Skipping the check for SP writes in the base implementation. The reason it

  // was added was apparently due to compile time concerns.

  //

  // TODO: Do we really want this barrier? It triggers unnecessary hazard nops

  // but is probably avoidable.


  // Copied from base implementation.

  // Terminators and labels can't be scheduled around.

  if (MI.isTerminator() || MI.isPosition())

    return true;


  // INLINEASM_BR can jump to another block

  if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)

    return true;


  if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)

    return true;


  // Target-independent instructions do not have an implicit-use of EXEC, even

  // when they operate on VGPRs. Treating EXEC modifications as scheduling

  // boundaries prevents incorrect movements of such instructions.

  return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||

         MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||

         MI.getOpcode() == AMDGPU::S_SETREG_B32 ||

         MI.getOpcode() == AMDGPU::S_SETPRIO ||

         MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||

         changesVGPRIndexingMode(MI);

}


bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {

  return Opcode == AMDGPU::DS_ORDERED_COUNT ||

         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||

         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);

}


bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {

  if (!isFLAT(MI) || isFLATGlobal(MI))

    return false;


  // If scratch is not initialized, we can never access it.

  if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))

    return false;


  // SCRATCH instructions always access scratch.

  if (isFLATScratch(MI))

    return true;


  // If there are no memory operands then conservatively assume the flat

  // operation may access scratch.

  if (MI.memoperands_empty())

    return true;


  // See if any memory operand specifies an address space that involves scratch.

  return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {

    unsigned AS = Memop->getAddrSpace();

    if (AS == AMDGPUAS::FLAT_ADDRESS) {

      const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;

      return !MD || !AMDGPU::hasValueInRangeLikeMetadata(

                        *MD, AMDGPUAS::PRIVATE_ADDRESS);

    }

    return AS == AMDGPUAS::PRIVATE_ADDRESS;

  });

}


bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {

  // Skip the full operand and register alias search modifiesRegister

  // does. There's only a handful of instructions that touch this, it's only an

  // implicit def, and doesn't alias any other registers.

  return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);

}


bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {

  unsigned Opcode = MI.getOpcode();


  if (MI.mayStore() && isSMRD(MI))

    return true; // scalar store or atomic


  // This will terminate the function when other lanes may need to continue.

  if (MI.isReturn())

    return true;


  // These instructions cause shader I/O that may cause hardware lockups

  // when executed with an empty EXEC mask.

  //

  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when

  //       EXEC = 0, but checking for that case here seems not worth it

  //       given the typical code patterns.

  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||

      isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||

      Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)

    return true;


  if (MI.isCall() || MI.isInlineAsm())

    return true; // conservative assumption


  // Assume that barrier interactions are only intended with active lanes.

  if (isBarrier(Opcode))

    return true;


  // A mode change is a scalar operation that influences vector instructions.

  if (modifiesModeRegister(MI))

    return true;


  // These are like SALU instructions in terms of effects, so it's questionable

  // whether we should return true for those.

  //

  // However, executing them with EXEC = 0 causes them to operate on undefined

  // data, which we avoid by returning true here.

  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||

      Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||

      Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||

      Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)

    return true;


  return false;

}


bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,

                              const MachineInstr &MI) const {

  if (MI.isMetaInstruction())

    return false;


  // This won't read exec if this is an SGPR->SGPR copy.

  if (MI.isCopyLike()) {

    if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))

      return true;


    // Make sure this isn't copying exec as a normal operand

    return MI.readsRegister(AMDGPU::EXEC, &RI);

  }


  // Make a conservative assumption about the callee.

  if (MI.isCall())

    return true;


  // Be conservative with any unhandled generic opcodes.

  if (!isTargetSpecificOpcode(MI.getOpcode()))

    return true;


  return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);

}


bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {

  switch (Imm.getBitWidth()) {

  case 1: // This likely will be a condition code mask.

    return true;


  case 32:

    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),

                                        ST.hasInv2PiInlineImm());

  case 64:

    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),

                                        ST.hasInv2PiInlineImm());

  case 16:

    return ST.has16BitInsts() &&

           AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),

                                         ST.hasInv2PiInlineImm());

  default:

    llvm_unreachable("invalid bitwidth");

  }

}


bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {

  APInt IntImm = Imm.bitcastToAPInt();

  int64_t IntImmVal = IntImm.getSExtValue();

  bool HasInv2Pi = ST.hasInv2PiInlineImm();

  switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {

  default:

    llvm_unreachable("invalid fltSemantics");

  case APFloatBase::S_IEEEsingle:

  case APFloatBase::S_IEEEdouble:

    return isInlineConstant(IntImm);

  case APFloatBase::S_BFloat:

    return ST.has16BitInsts() &&

           AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);

  case APFloatBase::S_IEEEhalf:

    return ST.has16BitInsts() &&

           AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);

  }

}


bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {

  // MachineOperand provides no way to tell the true operand size, since it only

  // records a 64-bit value. We need to know the size to determine if a 32-bit

  // floating point immediate bit pattern is legal for an integer immediate. It

  // would be for any 32-bit integer operand, but would not be for a 64-bit one.

  switch (OperandType) {

  case AMDGPU::OPERAND_REG_IMM_INT32:

  case AMDGPU::OPERAND_REG_IMM_FP32:

  case AMDGPU::OPERAND_REG_INLINE_C_INT32:

  case AMDGPU::OPERAND_REG_INLINE_C_FP32:

  case AMDGPU::OPERAND_REG_IMM_V2FP32:

  case AMDGPU::OPERAND_REG_IMM_V2INT32:

  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:

  case AMDGPU::OPERAND_REG_INLINE_AC_FP32:

  case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {

    int32_t Trunc = static_cast<int32_t>(Imm);

    return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());

  }

  case AMDGPU::OPERAND_REG_IMM_INT64:

  case AMDGPU::OPERAND_REG_IMM_FP64:

  case AMDGPU::OPERAND_REG_INLINE_C_INT64:

  case AMDGPU::OPERAND_REG_INLINE_C_FP64:

  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:

    return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());

  case AMDGPU::OPERAND_REG_IMM_INT16:

  case AMDGPU::OPERAND_REG_INLINE_C_INT16:

    // We would expect inline immediates to not be concerned with an integer/fp

    // distinction. However, in the case of 16-bit integer operations, the

    // "floating point" values appear to not work. It seems read the low 16-bits

    // of 32-bit immediates, which happens to always work for the integer

    // values.

    //

    // See llvm bugzilla 46302.

    //

    // TODO: Theoretically we could use op-sel to use the high bits of the

    // 32-bit FP values.

    return AMDGPU::isInlinableIntLiteral(Imm);

  case AMDGPU::OPERAND_REG_IMM_V2INT16:

  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:

    return AMDGPU::isInlinableLiteralV2I16(Imm);

  case AMDGPU::OPERAND_REG_IMM_V2FP16:

  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:

    return AMDGPU::isInlinableLiteralV2F16(Imm);

  case AMDGPU::OPERAND_REG_IMM_V2BF16:

  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:

    return AMDGPU::isInlinableLiteralV2BF16(Imm);

  case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:

    return false;

  case AMDGPU::OPERAND_REG_IMM_FP16:

  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {

    if (isInt<16>(Imm) || isUInt<16>(Imm)) {

      // A few special case instructions have 16-bit operands on subtargets

      // where 16-bit instructions are not legal.

      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle

      // constants in these cases

      int16_t Trunc = static_cast<int16_t>(Imm);

      return ST.has16BitInsts() &&

             AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());

    }


    return false;

  }

  case AMDGPU::OPERAND_REG_IMM_BF16:

  case AMDGPU::OPERAND_REG_INLINE_C_BF16: {

    if (isInt<16>(Imm) || isUInt<16>(Imm)) {

      int16_t Trunc = static_cast<int16_t>(Imm);

      return ST.has16BitInsts() &&

             AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());

    }

    return false;

  }

  case AMDGPU::OPERAND_KIMM32:

  case AMDGPU::OPERAND_KIMM16:

  case AMDGPU::OPERAND_KIMM64:

    return false;

  case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:

    return isLegalAV64PseudoImm(Imm);

  case AMDGPU::OPERAND_INPUT_MODS:

  case MCOI::OPERAND_IMMEDIATE:

    // Always embedded in the instruction for free.

    return true;

  case MCOI::OPERAND_UNKNOWN:

  case MCOI::OPERAND_REGISTER:

  case MCOI::OPERAND_PCREL:

  case MCOI::OPERAND_GENERIC_0:

  case MCOI::OPERAND_GENERIC_1:

  case MCOI::OPERAND_GENERIC_2:

  case MCOI::OPERAND_GENERIC_3:

  case MCOI::OPERAND_GENERIC_4:

  case MCOI::OPERAND_GENERIC_5:

    // Just ignore anything else.

    return true;

  default:

    llvm_unreachable("invalid operand type");

  }

}


static bool compareMachineOp(const MachineOperand &Op0,

                             const MachineOperand &Op1) {

  if (Op0.getType() != Op1.getType())

    return false;


  switch (Op0.getType()) {

  case MachineOperand::MO_Register:

    return Op0.getReg() == Op1.getReg();

  case MachineOperand::MO_Immediate:

    return Op0.getImm() == Op1.getImm();

  default:

    llvm_unreachable("Didn't expect to be comparing these operand types");

  }

}


bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,

                                        const MCOperandInfo &OpInfo) const {

  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)

    return true;


  if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))

    return false;


  if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))

    return true;


  return ST.hasVOP3Literal();

}


bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,

                                    int64_t ImmVal) const {

  const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];

  if (isInlineConstant(ImmVal, OpInfo.OperandType)) {

    if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&

        OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),

                                                     AMDGPU::OpName::src2))

      return false;

    return RI.opCanUseInlineConstant(OpInfo.OperandType);

  }


  return isLiteralOperandLegal(InstDesc, OpInfo);

}


bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,

                                    const MachineOperand &MO) const {

  if (MO.isImm())

    return isImmOperandLegal(InstDesc, OpNo, MO.getImm());


  assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&

         "unexpected imm-like operand kind");

  const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];

  return isLiteralOperandLegal(InstDesc, OpInfo);

}


bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {

  // 2 32-bit inline constants packed into one.

  return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&

         AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());

}


bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {

  // GFX90A does not have V_MUL_LEGACY_F32_e32.

  if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())

    return false;


  int Op32 = AMDGPU::getVOPe32(Opcode);

  if (Op32 == -1)

    return false;


  return pseudoToMCOpcode(Op32) != -1;

}


bool SIInstrInfo::hasModifiers(unsigned Opcode) const {

  // The src0_modifier operand is present on all instructions

  // that have modifiers.


  return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);

}


bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,

                                  AMDGPU::OpName OpName) const {

  const MachineOperand *Mods = getNamedOperand(MI, OpName);

  return Mods && Mods->getImm();

}


bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {

  return any_of(ModifierOpNames,

                [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });

}


bool SIInstrInfo::canShrink(const MachineInstr &MI,

                            const MachineRegisterInfo &MRI) const {

  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);

  // Can't shrink instruction with three operands.

  if (Src2) {

    switch (MI.getOpcode()) {

      default: return false;


      case AMDGPU::V_ADDC_U32_e64:

      case AMDGPU::V_SUBB_U32_e64:

      case AMDGPU::V_SUBBREV_U32_e64: {

        const MachineOperand *Src1

          = getNamedOperand(MI, AMDGPU::OpName::src1);

        if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))

          return false;

        // Additional verification is needed for sdst/src2.

        return true;

      }

      case AMDGPU::V_MAC_F16_e64:

      case AMDGPU::V_MAC_F32_e64:

      case AMDGPU::V_MAC_LEGACY_F32_e64:

      case AMDGPU::V_FMAC_F16_e64:

      case AMDGPU::V_FMAC_F16_t16_e64:

      case AMDGPU::V_FMAC_F16_fake16_e64:

      case AMDGPU::V_FMAC_F32_e64:

      case AMDGPU::V_FMAC_F64_e64:

      case AMDGPU::V_FMAC_LEGACY_F32_e64:

        if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||

            hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))

          return false;

        break;


      case AMDGPU::V_CNDMASK_B32_e64:

        break;

    }

  }


  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);

  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||

               hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))

    return false;


  // We don't need to check src0, all input types are legal, so just make sure

  // src0 isn't using any modifiers.

  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))

    return false;


  // Can it be shrunk to a valid 32 bit opcode?

  if (!hasVALU32BitEncoding(MI.getOpcode()))

    return false;


  // Check output modifiers

  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&

         !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&

         !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&

         // TODO: Can we avoid checking bound_ctrl/fi here?

         // They are only used by permlane*_swap special case.

         !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&

         !hasModifiersSet(MI, AMDGPU::OpName::fi);

}


// Set VCC operand with all flags from \p Orig, except for setting it as

// implicit.


static void copyFlagsToImplicitVCC(MachineInstr &MI,

                                   const MachineOperand &Orig) {


  for (MachineOperand &Use : MI.implicit_operands()) {

    if (Use.isUse() &&

        (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {

      Use.setIsUndef(Orig.isUndef());

      Use.setIsKill(Orig.isKill());

      return;

    }

  }

}


MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,

                                           unsigned Op32) const {

  MachineBasicBlock *MBB = MI.getParent();


  const MCInstrDesc &Op32Desc = get(Op32);

  MachineInstrBuilder Inst32 =

    BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)

    .setMIFlags(MI.getFlags());


  // Add the dst operand if the 32-bit encoding also has an explicit $vdst.

  // For VOPC instructions, this is replaced by an implicit def of vcc.


  // We assume the defs of the shrunk opcode are in the same order, and the

  // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).

  for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)

    Inst32.add(MI.getOperand(I));


  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);


  int Idx = MI.getNumExplicitDefs();

  for (const MachineOperand &Use : MI.explicit_uses()) {

    int OpTy = MI.getDesc().operands()[Idx++].OperandType;

    if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)

      continue;


    if (&Use == Src2) {

      if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {

        // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is

        // replaced with an implicit read of vcc or vcc_lo. The implicit read

        // of vcc was already added during the initial BuildMI, but we

        // 1) may need to change vcc to vcc_lo to preserve the original register

        // 2) have to preserve the original flags.

        copyFlagsToImplicitVCC(*Inst32, *Src2);

        continue;

      }

    }


    Inst32.add(Use);

  }


  // FIXME: Losing implicit operands

  fixImplicitOperands(*Inst32);

  return Inst32;

}


bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {

  // Null is free

  Register Reg = RegOp.getReg();

  if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)

    return false;


  // SGPRs use the constant bus


  // FIXME: implicit registers that are not part of the MCInstrDesc's implicit

  // physical register operands should also count, except for exec.

  if (RegOp.isImplicit())

    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;


  // SGPRs use the constant bus

  return AMDGPU::SReg_32RegClass.contains(Reg) ||

         AMDGPU::SReg_64RegClass.contains(Reg);

}


bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,

                                     const MachineRegisterInfo &MRI) const {

  Register Reg = RegOp.getReg();

  return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))

                         : physRegUsesConstantBus(RegOp);

}


bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,

                                  const MachineOperand &MO,

                                  const MCOperandInfo &OpInfo) const {

  // Literal constants use the constant bus.

  if (!MO.isReg())

    return !isInlineConstant(MO, OpInfo);


  Register Reg = MO.getReg();

  return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))

                         : physRegUsesConstantBus(MO);

}


static Register findImplicitSGPRRead(const MachineInstr &MI) {

  for (const MachineOperand &MO : MI.implicit_operands()) {

    // We only care about reads.

    if (MO.isDef())

      continue;


    switch (MO.getReg()) {

    case AMDGPU::VCC:

    case AMDGPU::VCC_LO:

    case AMDGPU::VCC_HI:

    case AMDGPU::M0:

    case AMDGPU::FLAT_SCR:

      return MO.getReg();


    default:

      break;

    }

  }


  return Register();

}


static bool shouldReadExec(const MachineInstr &MI) {

  if (SIInstrInfo::isVALU(MI)) {

    switch (MI.getOpcode()) {

    case AMDGPU::V_READLANE_B32:

    case AMDGPU::SI_RESTORE_S32_FROM_VGPR:

    case AMDGPU::V_WRITELANE_B32:

    case AMDGPU::SI_SPILL_S32_TO_VGPR:

      return false;

    }


    return true;

  }


  if (MI.isPreISelOpcode() ||

      SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||

      SIInstrInfo::isSALU(MI) ||

      SIInstrInfo::isSMRD(MI))

    return false;


  return true;

}


static bool isRegOrFI(const MachineOperand &MO) {

  return MO.isReg() || MO.isFI();

}


static bool isSubRegOf(const SIRegisterInfo &TRI,

                       const MachineOperand &SuperVec,

                       const MachineOperand &SubReg) {

  if (SubReg.getReg().isPhysical())

    return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());


  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&

         SubReg.getReg() == SuperVec.getReg();

}


// Verify the illegal copy from vector register to SGPR for generic opcode COPY

bool SIInstrInfo::verifyCopy(const MachineInstr &MI,

                             const MachineRegisterInfo &MRI,

                             StringRef &ErrInfo) const {

  Register DstReg = MI.getOperand(0).getReg();

  Register SrcReg = MI.getOperand(1).getReg();

  // This is a check for copy from vector register to SGPR

  if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {

    ErrInfo = "illegal copy from vector register to SGPR";

    return false;

  }

  return true;

}


bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,

                                    StringRef &ErrInfo) const {

  uint16_t Opcode = MI.getOpcode();

  const MachineFunction *MF = MI.getParent()->getParent();

  const MachineRegisterInfo &MRI = MF->getRegInfo();


  // FIXME: At this point the COPY verify is done only for non-ssa forms.

  // Find a better property to recognize the point where instruction selection

  // is just done.

  // We can only enforce this check after SIFixSGPRCopies pass so that the

  // illegal copies are legalized and thereafter we don't expect a pass

  // inserting similar copies.

  if (!MRI.isSSA() && MI.isCopy())

    return verifyCopy(MI, MRI, ErrInfo);


  if (SIInstrInfo::isGenericOpcode(Opcode))

    return true;


  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);

  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  int Src3Idx = -1;

  if (Src0Idx == -1) {

    // VOPD V_DUAL_* instructions use different operand names.

    Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);

    Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);

    Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);

    Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);

  }


  // Make sure the number of operands is correct.

  const MCInstrDesc &Desc = get(Opcode);

  if (!Desc.isVariadic() &&

      Desc.getNumOperands() != MI.getNumExplicitOperands()) {

    ErrInfo = "Instruction has wrong number of operands.";

    return false;

  }


  if (MI.isInlineAsm()) {

    // Verify register classes for inlineasm constraints.

    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();

         I != E; ++I) {

      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);

      if (!RC)

        continue;


      const MachineOperand &Op = MI.getOperand(I);

      if (!Op.isReg())

        continue;


      Register Reg = Op.getReg();

      if (!Reg.isVirtual() && !RC->contains(Reg)) {

        ErrInfo = "inlineasm operand has incorrect register class.";

        return false;

      }

    }


    return true;

  }


  if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {

    ErrInfo = "missing memory operand from image instruction.";

    return false;

  }


  // Make sure the register classes are correct.

  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {

    const MachineOperand &MO = MI.getOperand(i);

    if (MO.isFPImm()) {

      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "

                "all fp values to integers.";

      return false;

    }


    int RegClass = Desc.operands()[i].RegClass;


    const MCOperandInfo &OpInfo = Desc.operands()[i];

    switch (OpInfo.OperandType) {

    case MCOI::OPERAND_REGISTER:

      if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {

        ErrInfo = "Illegal immediate value for operand.";

        return false;

      }

      break;

    case AMDGPU::OPERAND_REG_IMM_INT32:

    case AMDGPU::OPERAND_REG_IMM_INT64:

    case AMDGPU::OPERAND_REG_IMM_INT16:

    case AMDGPU::OPERAND_REG_IMM_FP32:

    case AMDGPU::OPERAND_REG_IMM_V2FP32:

    case AMDGPU::OPERAND_REG_IMM_BF16:

    case AMDGPU::OPERAND_REG_IMM_FP16:

    case AMDGPU::OPERAND_REG_IMM_FP64:

    case AMDGPU::OPERAND_REG_IMM_V2FP16:

    case AMDGPU::OPERAND_REG_IMM_V2INT16:

    case AMDGPU::OPERAND_REG_IMM_V2INT32:

    case AMDGPU::OPERAND_REG_IMM_V2BF16:

      break;

    case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:

      break;

      break;

    case AMDGPU::OPERAND_REG_INLINE_C_INT16:

    case AMDGPU::OPERAND_REG_INLINE_C_INT32:

    case AMDGPU::OPERAND_REG_INLINE_C_INT64:

    case AMDGPU::OPERAND_REG_INLINE_C_FP32:

    case AMDGPU::OPERAND_REG_INLINE_C_FP64:

    case AMDGPU::OPERAND_REG_INLINE_C_BF16:

    case AMDGPU::OPERAND_REG_INLINE_C_FP16:

    case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:

    case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:

    case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:

    case AMDGPU::OPERAND_REG_INLINE_AC_INT32:

    case AMDGPU::OPERAND_REG_INLINE_AC_FP32:

    case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {

      if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {

        ErrInfo = "Illegal immediate value for operand.";

        return false;

      }

      break;

    }

    case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:

      if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {

        ErrInfo = "Expected inline constant for operand.";

        return false;

      }

      break;

    case AMDGPU::OPERAND_INPUT_MODS:

    case AMDGPU::OPERAND_SDWA_VOPC_DST:

    case AMDGPU::OPERAND_KIMM16:

      break;

    case MCOI::OPERAND_IMMEDIATE:

    case AMDGPU::OPERAND_KIMM32:

    case AMDGPU::OPERAND_KIMM64:

    case AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO:

      // Check if this operand is an immediate.

      // FrameIndex operands will be replaced by immediates, so they are

      // allowed.

      if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {

        ErrInfo = "Expected immediate, but got non-immediate";

        return false;

      }

      break;

    case MCOI::OPERAND_UNKNOWN:

    case MCOI::OPERAND_MEMORY:

    case MCOI::OPERAND_PCREL:

      break;

    default:

      if (OpInfo.isGenericType())

        continue;

      break;

    }


    if (!MO.isReg())

      continue;

    Register Reg = MO.getReg();

    if (!Reg)

      continue;


    // FIXME: Ideally we would have separate instruction definitions with the

    // aligned register constraint.

    // FIXME: We do not verify inline asm operands, but custom inline asm

    // verification is broken anyway

    if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {

      const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);

      if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {

        if (const TargetRegisterClass *SubRC =

                RI.getSubRegisterClass(RC, MO.getSubReg())) {

          RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());

          if (RC)

            RC = SubRC;

        }

      }


      // Check that this is the aligned version of the class.

      if (!RC || !RI.isProperlyAlignedRC(*RC)) {

        ErrInfo = "Subtarget requires even aligned vector registers";

        return false;

      }

    }


    if (RegClass != -1) {

      if (Reg.isVirtual())

        continue;


      const TargetRegisterClass *RC = RI.getRegClass(RegClass);

      if (!RC->contains(Reg)) {

        ErrInfo = "Operand has incorrect register class.";

        return false;

      }

    }

  }


  // Verify SDWA

  if (isSDWA(MI)) {

    if (!ST.hasSDWA()) {

      ErrInfo = "SDWA is not supported on this target";

      return false;

    }


    for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,

                    AMDGPU::OpName::dst_sel}) {

      const MachineOperand *MO = getNamedOperand(MI, Op);

      if (!MO)

        continue;

      int64_t Imm = MO->getImm();

      if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {

        ErrInfo = "Invalid SDWA selection";

        return false;

      }

    }


    int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);


    for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {

      if (OpIdx == -1)

        continue;

      const MachineOperand &MO = MI.getOperand(OpIdx);


      if (!ST.hasSDWAScalar()) {

        // Only VGPRS on VI

        if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {

          ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";

          return false;

        }

      } else {

        // No immediates on GFX9

        if (!MO.isReg()) {

          ErrInfo =

            "Only reg allowed as operands in SDWA instructions on GFX9+";

          return false;

        }

      }

    }


    if (!ST.hasSDWAOmod()) {

      // No omod allowed on VI

      const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);

      if (OMod != nullptr &&

        (!OMod->isImm() || OMod->getImm() != 0)) {

        ErrInfo = "OMod not allowed in SDWA instructions on VI";

        return false;

      }

    }


    if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||

        Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||

        Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||

        Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {

      const MachineOperand *Src0ModsMO =

          getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);

      unsigned Mods = Src0ModsMO->getImm();

      if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||

          Mods & SISrcMods::SEXT) {

        ErrInfo = "sext, abs and neg are not allowed on this instruction";

        return false;

      }

    }


    uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);

    if (isVOPC(BasicOpcode)) {

      if (!ST.hasSDWASdst() && DstIdx != -1) {

        // Only vcc allowed as dst on VI for VOPC

        const MachineOperand &Dst = MI.getOperand(DstIdx);

        if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {

          ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";

          return false;

        }

      } else if (!ST.hasSDWAOutModsVOPC()) {

        // No clamp allowed on GFX9 for VOPC

        const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);

        if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {

          ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";

          return false;

        }


        // No omod allowed on GFX9 for VOPC

        const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);

        if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {

          ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";

          return false;

        }

      }

    }


    const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);

    if (DstUnused && DstUnused->isImm() &&

        DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {

      const MachineOperand &Dst = MI.getOperand(DstIdx);

      if (!Dst.isReg() || !Dst.isTied()) {

        ErrInfo = "Dst register should have tied register";

        return false;

      }


      const MachineOperand &TiedMO =

          MI.getOperand(MI.findTiedOperandIdx(DstIdx));

      if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {

        ErrInfo =

            "Dst register should be tied to implicit use of preserved register";

        return false;

      }

      if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {

        ErrInfo = "Dst register should use same physical register as preserved";

        return false;

      }

    }

  }


  // Verify MIMG / VIMAGE / VSAMPLE

  if (isImage(Opcode) && !MI.mayStore()) {

    // Ensure that the return type used is large enough for all the options

    // being used TFE/LWE require an extra result register.

    const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);

    if (DMask) {

      uint64_t DMaskImm = DMask->getImm();

      uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);

      const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);

      const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);

      const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);


      // Adjust for packed 16 bit values

      if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())

        RegCount = divideCeil(RegCount, 2);


      // Adjust if using LWE or TFE

      if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))

        RegCount += 1;


      const uint32_t DstIdx =

          AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);

      const MachineOperand &Dst = MI.getOperand(DstIdx);

      if (Dst.isReg()) {

        const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);

        uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;

        if (RegCount > DstSize) {

          ErrInfo = "Image instruction returns too many registers for dst "

                    "register class";

          return false;

        }

      }

    }

  }


  // Verify VOP*. Ignore multiple sgpr operands on writelane.

  if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {

    unsigned ConstantBusCount = 0;

    bool UsesLiteral = false;

    const MachineOperand *LiteralVal = nullptr;


    int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);

    if (ImmIdx != -1) {

      ++ConstantBusCount;

      UsesLiteral = true;

      LiteralVal = &MI.getOperand(ImmIdx);

    }


    SmallVector<Register, 2> SGPRsUsed;

    Register SGPRUsed;


    // Only look at the true operands. Only a real operand can use the constant

    // bus, and we don't want to check pseudo-operands like the source modifier

    // flags.

    for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {

      if (OpIdx == -1)

        continue;

      const MachineOperand &MO = MI.getOperand(OpIdx);

      if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {

        if (MO.isReg()) {

          SGPRUsed = MO.getReg();

          if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {

            ++ConstantBusCount;

            SGPRsUsed.push_back(SGPRUsed);

          }

        } else if (!MO.isFI()) { // Treat FI like a register.

          if (!UsesLiteral) {

            ++ConstantBusCount;

            UsesLiteral = true;

            LiteralVal = &MO;

          } else if (!MO.isIdenticalTo(*LiteralVal)) {

            assert(isVOP2(MI) || isVOP3(MI));

            ErrInfo = "VOP2/VOP3 instruction uses more than one literal";

            return false;

          }

        }

      }

    }


    SGPRUsed = findImplicitSGPRRead(MI);

    if (SGPRUsed) {

      // Implicit uses may safely overlap true operands

      if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {

            return !RI.regsOverlap(SGPRUsed, SGPR);

          })) {

        ++ConstantBusCount;

        SGPRsUsed.push_back(SGPRUsed);

      }

    }


    // v_writelane_b32 is an exception from constant bus restriction:

    // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const

    if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&

        Opcode != AMDGPU::V_WRITELANE_B32) {

      ErrInfo = "VOP* instruction violates constant bus restriction";

      return false;

    }


    if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {

      ErrInfo = "VOP3 instruction uses literal";

      return false;

    }

  }


  // Special case for writelane - this can break the multiple constant bus rule,

  // but still can't use more than one SGPR register

  if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {

    unsigned SGPRCount = 0;

    Register SGPRUsed;


    for (int OpIdx : {Src0Idx, Src1Idx}) {

      if (OpIdx == -1)

        break;


      const MachineOperand &MO = MI.getOperand(OpIdx);


      if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {

        if (MO.isReg() && MO.getReg() != AMDGPU::M0) {

          if (MO.getReg() != SGPRUsed)

            ++SGPRCount;

          SGPRUsed = MO.getReg();

        }

      }

      if (SGPRCount > ST.getConstantBusLimit(Opcode)) {

        ErrInfo = "WRITELANE instruction violates constant bus restriction";

        return false;

      }

    }

  }


  // Verify misc. restrictions on specific instructions.

  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||

      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {

    const MachineOperand &Src0 = MI.getOperand(Src0Idx);

    const MachineOperand &Src1 = MI.getOperand(Src1Idx);

    const MachineOperand &Src2 = MI.getOperand(Src2Idx);

    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {

      if (!compareMachineOp(Src0, Src1) &&

          !compareMachineOp(Src0, Src2)) {

        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";

        return false;

      }

    }

    if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &

         SISrcMods::ABS) ||

        (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &

         SISrcMods::ABS) ||

        (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &

         SISrcMods::ABS)) {

      ErrInfo = "ABS not allowed in VOP3B instructions";

      return false;

    }

  }


  if (isSOP2(MI) || isSOPC(MI)) {

    const MachineOperand &Src0 = MI.getOperand(Src0Idx);

    const MachineOperand &Src1 = MI.getOperand(Src1Idx);


    if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&

        !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&

        !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&

        !Src0.isIdenticalTo(Src1)) {

      ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";

      return false;

    }

  }


  if (isSOPK(MI)) {

    const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);

    if (Desc.isBranch()) {

      if (!Op->isMBB()) {

        ErrInfo = "invalid branch target for SOPK instruction";

        return false;

      }

    } else {

      uint64_t Imm = Op->getImm();

      if (sopkIsZext(Opcode)) {

        if (!isUInt<16>(Imm)) {

          ErrInfo = "invalid immediate for SOPK instruction";

          return false;

        }

      } else {

        if (!isInt<16>(Imm)) {

          ErrInfo = "invalid immediate for SOPK instruction";

          return false;

        }

      }

    }

  }


  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||

      Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||

      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||

      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {

    const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||

                       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;


    const unsigned StaticNumOps =

        Desc.getNumOperands() + Desc.implicit_uses().size();

    const unsigned NumImplicitOps = IsDst ? 2 : 1;


    // Allow additional implicit operands. This allows a fixup done by the post

    // RA scheduler where the main implicit operand is killed and implicit-defs

    // are added for sub-registers that remain live after this instruction.

    if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {

      ErrInfo = "missing implicit register operands";

      return false;

    }


    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (IsDst) {

      if (!Dst->isUse()) {

        ErrInfo = "v_movreld_b32 vdst should be a use operand";

        return false;

      }


      unsigned UseOpIdx;

      if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||

          UseOpIdx != StaticNumOps + 1) {

        ErrInfo = "movrel implicit operands should be tied";

        return false;

      }

    }


    const MachineOperand &Src0 = MI.getOperand(Src0Idx);

    const MachineOperand &ImpUse

      = MI.getOperand(StaticNumOps + NumImplicitOps - 1);

    if (!ImpUse.isReg() || !ImpUse.isUse() ||

        !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {

      ErrInfo = "src0 should be subreg of implicit vector use";

      return false;

    }

  }


  // Make sure we aren't losing exec uses in the td files. This mostly requires

  // being careful when using let Uses to try to add other use registers.

  if (shouldReadExec(MI)) {

    if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {

      ErrInfo = "VALU instruction does not implicitly read exec mask";

      return false;

    }

  }


  if (isSMRD(MI)) {

    if (MI.mayStore() &&

        ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {

      // The register offset form of scalar stores may only use m0 as the

      // soffset register.

      const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);

      if (Soff && Soff->getReg() != AMDGPU::M0) {

        ErrInfo = "scalar stores must use m0 as offset register";

        return false;

      }

    }

  }


  if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {

    const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);

    if (Offset->getImm() != 0) {

      ErrInfo = "subtarget does not support offsets in flat instructions";

      return false;

    }

  }


  if (isDS(MI) && !ST.hasGDS()) {

    const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);

    if (GDSOp && GDSOp->getImm() != 0) {

      ErrInfo = "GDS is not supported on this subtarget";

      return false;

    }

  }


  if (isImage(MI)) {

    const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);

    if (DimOp) {

      int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,

                                                 AMDGPU::OpName::vaddr0);

      AMDGPU::OpName RSrcOpName =

          isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;

      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);

      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);

      const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =

          AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);

      const AMDGPU::MIMGDimInfo *Dim =

          AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());


      if (!Dim) {

        ErrInfo = "dim is out of range";

        return false;

      }


      bool IsA16 = false;

      if (ST.hasR128A16()) {

        const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);

        IsA16 = R128A16->getImm() != 0;

      } else if (ST.hasA16()) {

        const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);

        IsA16 = A16->getImm() != 0;

      }


      bool IsNSA = RsrcIdx - VAddr0Idx > 1;


      unsigned AddrWords =

          AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());


      unsigned VAddrWords;

      if (IsNSA) {

        VAddrWords = RsrcIdx - VAddr0Idx;

        if (ST.hasPartialNSAEncoding() &&

            AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {

          unsigned LastVAddrIdx = RsrcIdx - 1;

          VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;

        }

      } else {

        VAddrWords = getOpSize(MI, VAddr0Idx) / 4;

        if (AddrWords > 12)

          AddrWords = 16;

      }


      if (VAddrWords != AddrWords) {

        LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords

                          << " but got " << VAddrWords << "\n");

        ErrInfo = "bad vaddr size";

        return false;

      }

    }

  }


  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);

  if (DppCt) {

    using namespace AMDGPU::DPP;


    unsigned DC = DppCt->getImm();

    if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||

        DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||

        (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||

        (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||

        (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||

        (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||

        (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {

      ErrInfo = "Invalid dpp_ctrl value";

      return false;

    }

    if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&

        ST.getGeneration() >= AMDGPUSubtarget::GFX10) {

      ErrInfo = "Invalid dpp_ctrl value: "

                "wavefront shifts are not supported on GFX10+";

      return false;

    }

    if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&

        ST.getGeneration() >= AMDGPUSubtarget::GFX10) {

      ErrInfo = "Invalid dpp_ctrl value: "

                "broadcasts are not supported on GFX10+";

      return false;

    }

    if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&

        ST.getGeneration() < AMDGPUSubtarget::GFX10) {

      if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&

          DC <= DppCtrl::ROW_NEWBCAST_LAST &&

          !ST.hasGFX90AInsts()) {

        ErrInfo = "Invalid dpp_ctrl value: "

                  "row_newbroadcast/row_share is not supported before "

                  "GFX90A/GFX10";

        return false;

      }

      if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {

        ErrInfo = "Invalid dpp_ctrl value: "

                  "row_share and row_xmask are not supported before GFX10";

        return false;

      }

    }


    if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&

        !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&

        AMDGPU::isDPALU_DPP(Desc, ST)) {

      ErrInfo = "Invalid dpp_ctrl value: "

                "DP ALU dpp only support row_newbcast";

      return false;

    }

  }


  if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {

    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);

    AMDGPU::OpName DataName =

        isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;

    const MachineOperand *Data = getNamedOperand(MI, DataName);

    const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);

    if (Data && !Data->isReg())

      Data = nullptr;


    if (ST.hasGFX90AInsts()) {

      if (Dst && Data && !Dst->isTied() && !Data->isTied() &&

          (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {

        ErrInfo = "Invalid register class: "

                  "vdata and vdst should be both VGPR or AGPR";

        return false;

      }

      if (Data && Data2 &&

          (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {

        ErrInfo = "Invalid register class: "

                  "both data operands should be VGPR or AGPR";

        return false;

      }

    } else {

      if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||

          (Data && RI.isAGPR(MRI, Data->getReg())) ||

          (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {

        ErrInfo = "Invalid register class: "

                  "agpr loads and stores not supported on this GPU";

        return false;

      }

    }

  }


  if (ST.needsAlignedVGPRs()) {

    const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {

      const MachineOperand *Op = getNamedOperand(MI, OpName);

      if (!Op)

        return true;

      Register Reg = Op->getReg();

      if (Reg.isPhysical())

        return !(RI.getHWRegIndex(Reg) & 1);

      const TargetRegisterClass &RC = *MRI.getRegClass(Reg);

      return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&

             !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);

    };


    if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||

        Opcode == AMDGPU::DS_GWS_BARRIER) {


      if (!isAlignedReg(AMDGPU::OpName::data0)) {

        ErrInfo = "Subtarget requires even aligned vector registers "

                  "for DS_GWS instructions";

        return false;

      }

    }


    if (isMIMG(MI)) {

      if (!isAlignedReg(AMDGPU::OpName::vaddr)) {

        ErrInfo = "Subtarget requires even aligned vector registers "

                  "for vaddr operand of image instructions";

        return false;

      }

    }

  }


  if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {

    const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);

    if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {

      ErrInfo = "Invalid register class: "

                "v_accvgpr_write with an SGPR is not supported on this GPU";

      return false;

    }

  }


  if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {

    const MachineOperand &SrcOp = MI.getOperand(1);

    if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {

      ErrInfo = "pseudo expects only physical SGPRs";

      return false;

    }

  }


  if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {

    if (CPol->getImm() & AMDGPU::CPol::SCAL) {

      if (!ST.hasScaleOffset()) {

        ErrInfo = "Subtarget does not support offset scaling";

        return false;

      }

      if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {

        ErrInfo = "Instruction does not support offset scaling";

        return false;

      }

    }

  }


  // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more

  // information.

  if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {

    for (unsigned I = 0; I < 3; ++I) {

      if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))

        return false;

    }

  }


  return true;

}


// It is more readable to list mapped opcodes on the same line.

// clang-format off


unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {

  switch (MI.getOpcode()) {

  default: return AMDGPU::INSTRUCTION_LIST_END;

  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;

  case AMDGPU::COPY: return AMDGPU::COPY;

  case AMDGPU::PHI: return AMDGPU::PHI;

  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;

  case AMDGPU::WQM: return AMDGPU::WQM;

  case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;

  case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;

  case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;

  case AMDGPU::S_MOV_B32: {

    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

    return MI.getOperand(1).isReg() ||

           RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?

           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;

  }

  case AMDGPU::S_ADD_I32:

    return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;

  case AMDGPU::S_ADDC_U32:

    return AMDGPU::V_ADDC_U32_e32;

  case AMDGPU::S_SUB_I32:

    return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;

    // FIXME: These are not consistently handled, and selected when the carry is

    // used.

  case AMDGPU::S_ADD_U32:

    return AMDGPU::V_ADD_CO_U32_e32;

  case AMDGPU::S_SUB_U32:

    return AMDGPU::V_SUB_CO_U32_e32;

  case AMDGPU::S_ADD_U64_PSEUDO:

    return AMDGPU::V_ADD_U64_PSEUDO;

  case AMDGPU::S_SUB_U64_PSEUDO:

    return AMDGPU::V_SUB_U64_PSEUDO;

  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;

  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;

  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;

  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;

  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;

  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;

  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;

  case AMDGPU::S_XNOR_B32:

    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;

  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;

  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;

  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;

  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;

  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;

  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;

  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;

  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;

  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;

  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;

  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;

  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;

  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;

  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;

  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;

  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;

  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;

  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;

  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;

  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;

  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;

  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;

  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;

  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;

  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;

  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;

  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;

  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;

  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;

  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;

  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;

  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;

  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;

  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;

  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;

  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;

  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;

  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;

  case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;

  case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;

  case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;

  case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;

  case AMDGPU::S_CVT_F32_F16:

  case AMDGPU::S_CVT_HI_F32_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64

                                   : AMDGPU::V_CVT_F32_F16_fake16_e64;

  case AMDGPU::S_CVT_F16_F32:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64

                                   : AMDGPU::V_CVT_F16_F32_fake16_e64;

  case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;

  case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;

  case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;

  case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;

  case AMDGPU::S_CEIL_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64

                                   : AMDGPU::V_CEIL_F16_fake16_e64;

  case AMDGPU::S_FLOOR_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64

                                   : AMDGPU::V_FLOOR_F16_fake16_e64;

  case AMDGPU::S_TRUNC_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64

                                   : AMDGPU::V_TRUNC_F16_fake16_e64;

  case AMDGPU::S_RNDNE_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64

                                   : AMDGPU::V_RNDNE_F16_fake16_e64;

  case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;

  case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;

  case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;

  case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;

  case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;

  case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;

  case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;

  case AMDGPU::S_ADD_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64

                                   : AMDGPU::V_ADD_F16_fake16_e64;

  case AMDGPU::S_SUB_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64

                                   : AMDGPU::V_SUB_F16_fake16_e64;

  case AMDGPU::S_MIN_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64

                                   : AMDGPU::V_MIN_F16_fake16_e64;

  case AMDGPU::S_MAX_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64

                                   : AMDGPU::V_MAX_F16_fake16_e64;

  case AMDGPU::S_MINIMUM_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64

                                   : AMDGPU::V_MINIMUM_F16_fake16_e64;

  case AMDGPU::S_MAXIMUM_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64

                                   : AMDGPU::V_MAXIMUM_F16_fake16_e64;

  case AMDGPU::S_MUL_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64

                                   : AMDGPU::V_MUL_F16_fake16_e64;

  case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;

  case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;

  case AMDGPU::S_FMAC_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64

                                   : AMDGPU::V_FMAC_F16_fake16_e64;

  case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;

  case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;

  case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;

  case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;

  case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;

  case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;

  case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;

  case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;

  case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;

  case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;

  case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;

  case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;

  case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;

  case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;

  case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;

  case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;

  case AMDGPU::S_CMP_LT_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64

                                   : AMDGPU::V_CMP_LT_F16_fake16_e64;

  case AMDGPU::S_CMP_EQ_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64

                                   : AMDGPU::V_CMP_EQ_F16_fake16_e64;

  case AMDGPU::S_CMP_LE_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64

                                   : AMDGPU::V_CMP_LE_F16_fake16_e64;

  case AMDGPU::S_CMP_GT_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64

                                   : AMDGPU::V_CMP_GT_F16_fake16_e64;

  case AMDGPU::S_CMP_LG_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64

                                   : AMDGPU::V_CMP_LG_F16_fake16_e64;

  case AMDGPU::S_CMP_GE_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64

                                   : AMDGPU::V_CMP_GE_F16_fake16_e64;

  case AMDGPU::S_CMP_O_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64

                                   : AMDGPU::V_CMP_O_F16_fake16_e64;

  case AMDGPU::S_CMP_U_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64

                                   : AMDGPU::V_CMP_U_F16_fake16_e64;

  case AMDGPU::S_CMP_NGE_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64

                                   : AMDGPU::V_CMP_NGE_F16_fake16_e64;

  case AMDGPU::S_CMP_NLG_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64

                                   : AMDGPU::V_CMP_NLG_F16_fake16_e64;

  case AMDGPU::S_CMP_NGT_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64

                                   : AMDGPU::V_CMP_NGT_F16_fake16_e64;

  case AMDGPU::S_CMP_NLE_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64

                                   : AMDGPU::V_CMP_NLE_F16_fake16_e64;

  case AMDGPU::S_CMP_NEQ_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64

                                   : AMDGPU::V_CMP_NEQ_F16_fake16_e64;

  case AMDGPU::S_CMP_NLT_F16:

    return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64

                                   : AMDGPU::V_CMP_NLT_F16_fake16_e64;

  case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;

  case AMDGPU::V_S_EXP_F16_e64:

    return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64

                                   : AMDGPU::V_EXP_F16_fake16_e64;

  case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;

  case AMDGPU::V_S_LOG_F16_e64:

    return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64

                                   : AMDGPU::V_LOG_F16_fake16_e64;

  case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;

  case AMDGPU::V_S_RCP_F16_e64:

    return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64

                                   : AMDGPU::V_RCP_F16_fake16_e64;

  case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;

  case AMDGPU::V_S_RSQ_F16_e64:

    return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64

                                   : AMDGPU::V_RSQ_F16_fake16_e64;

  case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;

  case AMDGPU::V_S_SQRT_F16_e64:

    return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64

                                   : AMDGPU::V_SQRT_F16_fake16_e64;

  }

  llvm_unreachable(

      "Unexpected scalar opcode without corresponding vector one!");

}


// clang-format on


void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,

                                        MachineBasicBlock &MBB,

                                        MachineBasicBlock::iterator MBBI,

                                        const DebugLoc &DL, Register Reg,

                                        bool IsSCCLive,

                                        SlotIndexes *Indexes) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  const SIInstrInfo *TII = ST.getInstrInfo();

  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

  if (IsSCCLive) {

    // Insert two move instructions, one to save the original value of EXEC and

    // the other to turn on all bits in EXEC. This is required as we can't use

    // the single instruction S_OR_SAVEEXEC that clobbers SCC.

    auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)

                           .addReg(LMC.ExecReg, RegState::Kill);

    auto FlipExecMI =

        BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);

    if (Indexes) {

      Indexes->insertMachineInstrInMaps(*StoreExecMI);

      Indexes->insertMachineInstrInMaps(*FlipExecMI);

    }

  } else {

    auto SaveExec =

        BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);

    SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

    if (Indexes)

      Indexes->insertMachineInstrInMaps(*SaveExec);

  }

}


void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,

                              MachineBasicBlock::iterator MBBI,

                              const DebugLoc &DL, Register Reg,

                              SlotIndexes *Indexes) const {

  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

  auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)

                           .addReg(Reg, RegState::Kill);

  if (Indexes)

    Indexes->insertMachineInstrInMaps(*ExecRestoreMI);

}


MachineInstr *


SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {

  assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&

         "Not a whole wave func");

  MachineBasicBlock &MBB = *MF.begin();

  for (MachineInstr &MI : MBB)

    if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||

        MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)

      return &MI;


  llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");

}


static const TargetRegisterClass *


adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,

                          const MCInstrDesc &TID, unsigned RCID) {

  if (!ST.hasGFX90AInsts() && (TID.mayLoad() || TID.mayStore())) {

    switch (RCID) {

    case AMDGPU::AV_32RegClassID:

      RCID = AMDGPU::VGPR_32RegClassID;

      break;

    case AMDGPU::AV_64RegClassID:

      RCID = AMDGPU::VReg_64RegClassID;

      break;

    case AMDGPU::AV_96RegClassID:

      RCID = AMDGPU::VReg_96RegClassID;

      break;

    case AMDGPU::AV_128RegClassID:

      RCID = AMDGPU::VReg_128RegClassID;

      break;

    case AMDGPU::AV_160RegClassID:

      RCID = AMDGPU::VReg_160RegClassID;

      break;

    case AMDGPU::AV_512RegClassID:

      RCID = AMDGPU::VReg_512RegClassID;

      break;

    default:

      break;

    }

  }


  return RI.getProperlyAlignedRC(RI.getRegClass(RCID));

}


const TargetRegisterClass *


SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,

                         const TargetRegisterInfo *TRI) const {

  if (OpNum >= TID.getNumOperands())

    return nullptr;

  auto RegClass = TID.operands()[OpNum].RegClass;

  // Special pseudos have no alignment requirement.

  if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO || isSpill(TID))

    return RI.getRegClass(RegClass);


  return adjustAllocatableRegClass(ST, RI, TID, RegClass);

}


const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,

                                                      unsigned OpNo) const {

  const MCInstrDesc &Desc = get(MI.getOpcode());

  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||

      Desc.operands()[OpNo].RegClass == -1) {

    Register Reg = MI.getOperand(OpNo).getReg();


    if (Reg.isVirtual()) {

      const MachineRegisterInfo &MRI =

          MI.getParent()->getParent()->getRegInfo();

      return MRI.getRegClass(Reg);

    }

    return RI.getPhysRegBaseClass(Reg);

  }


  unsigned RCID = Desc.operands()[OpNo].RegClass;

  return adjustAllocatableRegClass(ST, RI, Desc, RCID);

}


void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {

  MachineBasicBlock::iterator I = MI;

  MachineBasicBlock *MBB = MI.getParent();

  MachineOperand &MO = MI.getOperand(OpIdx);

  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

  unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;

  const TargetRegisterClass *RC = RI.getRegClass(RCID);

  unsigned Size = RI.getRegSizeInBits(*RC);

  unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO

                    : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64

                                 : AMDGPU::V_MOV_B32_e32;

  if (MO.isReg())

    Opcode = AMDGPU::COPY;

  else if (RI.isSGPRClass(RC))

    Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;


  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);

  Register Reg = MRI.createVirtualRegister(VRC);

  DebugLoc DL = MBB->findDebugLoc(I);

  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);

  MO.ChangeToRegister(Reg, false);

}


unsigned SIInstrInfo::buildExtractSubReg(

    MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,

    const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,

    unsigned SubIdx, const TargetRegisterClass *SubRC) const {

  if (!SuperReg.getReg().isVirtual())

    return RI.getSubReg(SuperReg.getReg(), SubIdx);


  MachineBasicBlock *MBB = MI->getParent();

  const DebugLoc &DL = MI->getDebugLoc();

  Register SubReg = MRI.createVirtualRegister(SubRC);


  unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)

      .addReg(SuperReg.getReg(), 0, NewSubIdx);

  return SubReg;

}


MachineOperand SIInstrInfo::buildExtractSubRegOrImm(

    MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,

    const MachineOperand &Op, const TargetRegisterClass *SuperRC,

    unsigned SubIdx, const TargetRegisterClass *SubRC) const {

  if (Op.isImm()) {

    if (SubIdx == AMDGPU::sub0)

      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));

    if (SubIdx == AMDGPU::sub1)

      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));


    llvm_unreachable("Unhandled register index for immediate");

  }


  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,

                                       SubIdx, SubRC);

  return MachineOperand::CreateReg(SubReg, false);

}


// Change the order of operands from (0, 1, 2) to (0, 2, 1)

void SIInstrInfo::swapOperands(MachineInstr &Inst) const {

  assert(Inst.getNumExplicitOperands() == 3);

  MachineOperand Op1 = Inst.getOperand(1);

  Inst.removeOperand(1);

  Inst.addOperand(Op1);

}


bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,

                                    const MCOperandInfo &OpInfo,

                                    const MachineOperand &MO) const {

  if (!MO.isReg())

    return false;


  Register Reg = MO.getReg();


  const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);

  if (Reg.isPhysical())

    return DRC->contains(Reg);


  const TargetRegisterClass *RC = MRI.getRegClass(Reg);


  if (MO.getSubReg()) {

    const MachineFunction *MF = MO.getParent()->getParent()->getParent();

    const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);

    if (!SuperRC)

      return false;

    return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;

  }


  return RI.getCommonSubClass(DRC, RC) != nullptr;

}


bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,

                                    const MachineOperand &MO) const {

  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];

  unsigned Opc = MI.getOpcode();


  // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more

  // information.

  if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&

      MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {

    constexpr const AMDGPU::OpName OpNames[] = {

        AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};


    for (auto [I, OpName] : enumerate(OpNames)) {

      int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);

      if (static_cast<unsigned>(SrcIdx) == OpIdx &&

          !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))

        return false;

    }

  }


  if (!isLegalRegOperand(MRI, OpInfo, MO))

    return false;


  // check Accumulate GPR operand

  bool IsAGPR = RI.isAGPR(MRI, MO.getReg());

  if (IsAGPR && !ST.hasMAIInsts())

    return false;

  if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&

      (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))

    return false;

  // Atomics should have both vdst and vdata either vgpr or agpr.

  const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);

  const int DataIdx = AMDGPU::getNamedOperandIdx(

      Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);

  if ((int)OpIdx == VDstIdx && DataIdx != -1 &&

      MI.getOperand(DataIdx).isReg() &&

      RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)

    return false;

  if ((int)OpIdx == DataIdx) {

    if (VDstIdx != -1 &&

        RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)

      return false;

    // DS instructions with 2 src operands also must have tied RC.

    const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);

    if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&

        RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)

      return false;

  }


  // Check V_ACCVGPR_WRITE_B32_e64

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&

      (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&

      RI.isSGPRReg(MRI, MO.getReg()))

    return false;

  return true;

}


bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,

                                     const MCOperandInfo &OpInfo,

                                     const MachineOperand &MO) const {

  if (MO.isReg())

    return isLegalRegOperand(MRI, OpInfo, MO);


  // Handle non-register types that are treated like immediates.

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());

  return true;

}


bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(

    const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,

    const MachineOperand *MO) const {

  constexpr const unsigned NumOps = 3;

  constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {

      AMDGPU::OpName::src0,           AMDGPU::OpName::src1,

      AMDGPU::OpName::src2,           AMDGPU::OpName::src0_modifiers,

      AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};


  assert(SrcN < NumOps);


  if (!MO) {

    int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);

    if (SrcIdx == -1)

      return true;

    MO = &MI.getOperand(SrcIdx);

  }


  if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))

    return true;


  int ModsIdx =

      AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);

  if (ModsIdx == -1)

    return true;


  unsigned Mods = MI.getOperand(ModsIdx).getImm();

  bool OpSel = Mods & SISrcMods::OP_SEL_0;

  bool OpSelHi = Mods & SISrcMods::OP_SEL_1;


  return !OpSel && !OpSelHi;

}


bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,

                                 const MachineOperand *MO) const {

  const MachineFunction &MF = *MI.getParent()->getParent();

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MCInstrDesc &InstDesc = MI.getDesc();

  const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];

  const TargetRegisterClass *DefinedRC =

      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;

  if (!MO)

    MO = &MI.getOperand(OpIdx);


  const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);


  if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {

    const MachineOperand *UsedLiteral = nullptr;


    int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());

    int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;


    // TODO: Be more permissive with frame indexes.

    if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {

      if (!LiteralLimit--)

        return false;


      UsedLiteral = MO;

    }


    SmallDenseSet<RegSubRegPair> SGPRsUsed;

    if (MO->isReg())

      SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));


    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {

      if (i == OpIdx)

        continue;

      const MachineOperand &Op = MI.getOperand(i);

      if (Op.isReg()) {

        if (Op.isUse()) {

          RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());

          if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {

            if (--ConstantBusLimit <= 0)

              return false;

          }

        }

      } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&

                 !isInlineConstant(Op, InstDesc.operands()[i])) {

        // The same literal may be used multiple times.

        if (!UsedLiteral)

          UsedLiteral = &Op;

        else if (UsedLiteral->isIdenticalTo(Op))

          continue;


        if (!LiteralLimit--)

          return false;

        if (--ConstantBusLimit <= 0)

          return false;

      }

    }

  } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {

    // There can be at most one literal operand, but it can be repeated.

    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {

      if (i == OpIdx)

        continue;

      const MachineOperand &Op = MI.getOperand(i);

      if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&

          !isInlineConstant(Op, InstDesc.operands()[i]) &&

          !Op.isIdenticalTo(*MO))

        return false;


      // Do not fold a non-inlineable and non-register operand into an

      // instruction that already has a frame index. The frame index handling

      // code could not handle well when a frame index co-exists with another

      // non-register operand, unless that operand is an inlineable immediate.

      if (Op.isFI())

        return false;

    }

  } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&

             isF16PseudoScalarTrans(MI.getOpcode())) {

    return false;

  }


  if (MO->isReg()) {

    if (!DefinedRC)

      return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;

    return isLegalRegOperand(MI, OpIdx, *MO);

  }


  if (MO->isImm()) {

    uint64_t Imm = MO->getImm();

    bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;

    bool Is64BitOp = Is64BitFPOp ||

                     OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||

                     OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||

                     OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;

    if (Is64BitOp &&

        !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {

      if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&

          (!ST.has64BitLiterals() || InstDesc.getSize() != 4))

        return false;


      // FIXME: We can use sign extended 64-bit literals, but only for signed

      //        operands. At the moment we do not know if an operand is signed.

      //        Such operand will be encoded as its low 32 bits and then either

      //        correctly sign extended or incorrectly zero extended by HW.

      //        If 64-bit literals are supported and the literal will be encoded

      //        as full 64 bit we still can use it.

      if (!Is64BitFPOp && (int32_t)Imm < 0 &&

          (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))

        return false;

    }

  }


  // Handle non-register types that are treated like immediates.

  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());


  if (!DefinedRC) {

    // This operand expects an immediate.

    return true;

  }


  return isImmOperandLegal(MI, OpIdx, *MO);

}


void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,

                                       MachineInstr &MI) const {

  unsigned Opc = MI.getOpcode();

  const MCInstrDesc &InstrDesc = get(Opc);


  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);

  MachineOperand &Src0 = MI.getOperand(Src0Idx);


  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);

  MachineOperand &Src1 = MI.getOperand(Src1Idx);


  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32

  // we need to only have one constant bus use before GFX10.

  bool HasImplicitSGPR = findImplicitSGPRRead(MI);

  if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&

      RI.isSGPRReg(MRI, Src0.getReg()))

    legalizeOpWithMove(MI, Src0Idx);


  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for

  // both the value to write (src0) and lane select (src1).  Fix up non-SGPR

  // src0/src1 with V_READFIRSTLANE.

  if (Opc == AMDGPU::V_WRITELANE_B32) {

    const DebugLoc &DL = MI.getDebugLoc();

    if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {

      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)

          .add(Src0);

      Src0.ChangeToRegister(Reg, false);

    }

    if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {

      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

      const DebugLoc &DL = MI.getDebugLoc();

      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)

          .add(Src1);

      Src1.ChangeToRegister(Reg, false);

    }

    return;

  }


  // No VOP2 instructions support AGPRs.

  if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))

    legalizeOpWithMove(MI, Src0Idx);


  if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))

    legalizeOpWithMove(MI, Src1Idx);


  // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.

  if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {

    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

    if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))

      legalizeOpWithMove(MI, Src2Idx);

  }


  // VOP2 src0 instructions support all operand types, so we don't need to check

  // their legality. If src1 is already legal, we don't need to do anything.

  if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))

    return;


  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for

  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane

  // select is uniform.

  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&

      RI.isVGPR(MRI, Src1.getReg())) {

    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

    const DebugLoc &DL = MI.getDebugLoc();

    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)

        .add(Src1);

    Src1.ChangeToRegister(Reg, false);

    return;

  }


  // We do not use commuteInstruction here because it is too aggressive and will

  // commute if it is possible. We only want to commute here if it improves

  // legality. This can be called a fairly large number of times so don't waste

  // compile time pointlessly swapping and checking legality again.

  if (HasImplicitSGPR || !MI.isCommutable()) {

    legalizeOpWithMove(MI, Src1Idx);

    return;

  }


  // If src0 can be used as src1, commuting will make the operands legal.

  // Otherwise we have to give up and insert a move.

  //

  // TODO: Other immediate-like operand kinds could be commuted if there was a

  // MachineOperand::ChangeTo* for them.

  if ((!Src1.isImm() && !Src1.isReg()) ||

      !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {

    legalizeOpWithMove(MI, Src1Idx);

    return;

  }


  int CommutedOpc = commuteOpcode(MI);

  if (CommutedOpc == -1) {

    legalizeOpWithMove(MI, Src1Idx);

    return;

  }


  MI.setDesc(get(CommutedOpc));


  Register Src0Reg = Src0.getReg();

  unsigned Src0SubReg = Src0.getSubReg();

  bool Src0Kill = Src0.isKill();


  if (Src1.isImm())

    Src0.ChangeToImmediate(Src1.getImm());

  else if (Src1.isReg()) {

    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());

    Src0.setSubReg(Src1.getSubReg());

  } else

    llvm_unreachable("Should only have register or immediate operands");


  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);

  Src1.setSubReg(Src0SubReg);

  fixImplicitOperands(MI);

}


// Legalize VOP3 operands. All operand types are supported for any operand

// but only one literal constant and only starting from GFX10.


void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,

                                       MachineInstr &MI) const {

  unsigned Opc = MI.getOpcode();


  int VOP3Idx[3] = {

    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),

    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),

    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)

  };


  if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||

      Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||

      Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||

      Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||

      Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||

      Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||

      Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {

    // src1 and src2 must be scalar

    MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);

    const DebugLoc &DL = MI.getDebugLoc();

    if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {

      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)

        .add(Src1);

      Src1.ChangeToRegister(Reg, false);

    }

    if (VOP3Idx[2] != -1) {

      MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);

      if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {

        Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

        BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)

            .add(Src2);

        Src2.ChangeToRegister(Reg, false);

      }

    }

  }


  // Find the one SGPR operand we are allowed to use.

  int ConstantBusLimit = ST.getConstantBusLimit(Opc);

  int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;

  SmallDenseSet<unsigned> SGPRsUsed;

  Register SGPRReg = findUsedSGPR(MI, VOP3Idx);

  if (SGPRReg) {

    SGPRsUsed.insert(SGPRReg);

    --ConstantBusLimit;

  }


  for (int Idx : VOP3Idx) {

    if (Idx == -1)

      break;

    MachineOperand &MO = MI.getOperand(Idx);


    if (!MO.isReg()) {

      if (isInlineConstant(MO, get(Opc).operands()[Idx]))

        continue;


      if (LiteralLimit > 0 && ConstantBusLimit > 0) {

        --LiteralLimit;

        --ConstantBusLimit;

        continue;

      }


      --LiteralLimit;

      --ConstantBusLimit;

      legalizeOpWithMove(MI, Idx);

      continue;

    }


    if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&

        !isOperandLegal(MI, Idx, &MO)) {

      legalizeOpWithMove(MI, Idx);

      continue;

    }


    if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))

      continue; // VGPRs are legal


    // We can use one SGPR in each VOP3 instruction prior to GFX10

    // and two starting from GFX10.

    if (SGPRsUsed.count(MO.getReg()))

      continue;

    if (ConstantBusLimit > 0) {

      SGPRsUsed.insert(MO.getReg());

      --ConstantBusLimit;

      continue;

    }


    // If we make it this far, then the operand is not legal and we must

    // legalize it.

    legalizeOpWithMove(MI, Idx);

  }


  // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.

  if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&

      !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))

    legalizeOpWithMove(MI, VOP3Idx[2]);


  if (isWMMA(MI)) {

    // scale_src has a register class restricted to low 256 VGPRs, we may need

    // to insert a copy to the restricted VGPR class.

    int ScaleSrc0Idx =

        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src0);

    if (ScaleSrc0Idx != -1) {

      int ScaleSrc1Idx =

          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src1);

      if (!isOperandLegal(MI, ScaleSrc0Idx))

        legalizeOpWithMove(MI, ScaleSrc0Idx);

      if (!isOperandLegal(MI, ScaleSrc1Idx))

        legalizeOpWithMove(MI, ScaleSrc1Idx);

    }

  }


  // Fix the register class of packed FP32 instructions on gfx12+. See

  // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.

  if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {

    for (unsigned I = 0; I < 3; ++I) {

      if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))

        legalizeOpWithMove(MI, VOP3Idx[I]);

    }

  }

}


Register SIInstrInfo::readlaneVGPRToSGPR(

    Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,

    const TargetRegisterClass *DstRC /*=nullptr*/) const {

  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);

  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);

  if (DstRC)

    SRC = RI.getCommonSubClass(SRC, DstRC);


  Register DstReg = MRI.createVirtualRegister(SRC);

  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;


  if (RI.hasAGPRs(VRC)) {

    VRC = RI.getEquivalentVGPRClass(VRC);

    Register NewSrcReg = MRI.createVirtualRegister(VRC);

    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),

            get(TargetOpcode::COPY), NewSrcReg)

        .addReg(SrcReg);

    SrcReg = NewSrcReg;

  }


  if (SubRegs == 1) {

    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),

            get(AMDGPU::V_READFIRSTLANE_B32), DstReg)

        .addReg(SrcReg);

    return DstReg;

  }


  SmallVector<Register, 8> SRegs;

  for (unsigned i = 0; i < SubRegs; ++i) {

    Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);

    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),

            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)

        .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));

    SRegs.push_back(SGPR);

  }


  MachineInstrBuilder MIB =

      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),

              get(AMDGPU::REG_SEQUENCE), DstReg);

  for (unsigned i = 0; i < SubRegs; ++i) {

    MIB.addReg(SRegs[i]);

    MIB.addImm(RI.getSubRegFromChannel(i));

  }

  return DstReg;

}


void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,

                                       MachineInstr &MI) const {


  // If the pointer is store in VGPRs, then we need to move them to

  // SGPRs using v_readfirstlane.  This is safe because we only select

  // loads with uniform pointers to SMRD instruction so we know the

  // pointer value is uniform.

  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);

  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {

    Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);

    SBase->setReg(SGPR);

  }

  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);

  if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {

    Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);

    SOff->setReg(SGPR);

  }

}


bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {

  unsigned Opc = Inst.getOpcode();

  int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);

  if (OldSAddrIdx < 0)

    return false;


  assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));


  int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);

  if (NewOpc < 0)

    NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);

  if (NewOpc < 0)

    return false;


  MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();

  MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);

  if (RI.isSGPRReg(MRI, SAddr.getReg()))

    return false;


  int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);

  if (NewVAddrIdx < 0)

    return false;


  int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);


  // Check vaddr, it shall be zero or absent.

  MachineInstr *VAddrDef = nullptr;

  if (OldVAddrIdx >= 0) {

    MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);

    VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());

    if (!VAddrDef || !VAddrDef->isMoveImmediate() ||

        !VAddrDef->getOperand(1).isImm() ||

        VAddrDef->getOperand(1).getImm() != 0)

      return false;

  }


  const MCInstrDesc &NewDesc = get(NewOpc);

  Inst.setDesc(NewDesc);


  // Callers expect iterator to be valid after this call, so modify the

  // instruction in place.

  if (OldVAddrIdx == NewVAddrIdx) {

    MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);

    // Clear use list from the old vaddr holding a zero register.

    MRI.removeRegOperandFromUseList(&NewVAddr);

    MRI.moveOperands(&NewVAddr, &SAddr, 1);

    Inst.removeOperand(OldSAddrIdx);

    // Update the use list with the pointer we have just moved from vaddr to

    // saddr position. Otherwise new vaddr will be missing from the use list.

    MRI.removeRegOperandFromUseList(&NewVAddr);

    MRI.addRegOperandToUseList(&NewVAddr);

  } else {

    assert(OldSAddrIdx == NewVAddrIdx);


    if (OldVAddrIdx >= 0) {

      int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,

                                                 AMDGPU::OpName::vdst_in);


      // removeOperand doesn't try to fixup tied operand indexes at it goes, so

      // it asserts. Untie the operands for now and retie them afterwards.

      if (NewVDstIn != -1) {

        int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);

        Inst.untieRegOperand(OldVDstIn);

      }


      Inst.removeOperand(OldVAddrIdx);


      if (NewVDstIn != -1) {

        int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);

        Inst.tieOperands(NewVDst, NewVDstIn);

      }

    }

  }


  if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))

    VAddrDef->eraseFromParent();


  return true;

}


// FIXME: Remove this when SelectionDAG is obsoleted.


void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,

                                       MachineInstr &MI) const {

  if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())

    return;


  // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence

  // thinks they are uniform, so a readfirstlane should be valid.

  MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);

  if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))

    return;


  if (moveFlatAddrToVGPR(MI))

    return;


  const TargetRegisterClass *DeclaredRC =

      getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);


  Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);

  SAddr->setReg(ToSGPR);

}


void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,

                                         MachineBasicBlock::iterator I,

                                         const TargetRegisterClass *DstRC,

                                         MachineOperand &Op,

                                         MachineRegisterInfo &MRI,

                                         const DebugLoc &DL) const {

  Register OpReg = Op.getReg();

  unsigned OpSubReg = Op.getSubReg();


  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(

      RI.getRegClassForReg(MRI, OpReg), OpSubReg);


  // Check if operand is already the correct register class.

  if (DstRC == OpRC)

    return;


  Register DstReg = MRI.createVirtualRegister(DstRC);

  auto Copy =

      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);

  Op.setReg(DstReg);


  MachineInstr *Def = MRI.getVRegDef(OpReg);

  if (!Def)

    return;


  // Try to eliminate the copy if it is copying an immediate value.

  if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)

    foldImmediate(*Copy, *Def, OpReg, &MRI);


  bool ImpDef = Def->isImplicitDef();

  while (!ImpDef && Def && Def->isCopy()) {

    if (Def->getOperand(1).getReg().isPhysical())

      break;

    Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());

    ImpDef = Def && Def->isImplicitDef();

  }

  if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&

      !ImpDef)

    Copy.addReg(AMDGPU::EXEC, RegState::Implicit);

}


// Emit the actual waterfall loop, executing the wrapped instruction for each

// unique value of \p ScalarOps across all lanes. In the best case we execute 1

// iteration, in the worst case we execute 64 (once per lane).

static void


emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,

                              MachineRegisterInfo &MRI,

                              MachineBasicBlock &LoopBB,

                              MachineBasicBlock &BodyBB,

                              const DebugLoc &DL,

                              ArrayRef<MachineOperand *> ScalarOps) {

  MachineFunction &MF = *LoopBB.getParent();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();


  MachineBasicBlock::iterator I = LoopBB.begin();

  Register CondReg;


  for (MachineOperand *ScalarOp : ScalarOps) {

    unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);

    unsigned NumSubRegs = RegSize / 32;

    Register VScalarOp = ScalarOp->getReg();


    if (NumSubRegs == 1) {

      Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);


      BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)

          .addReg(VScalarOp);


      Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);


      BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)

          .addReg(CurReg)

          .addReg(VScalarOp);


      // Combine the comparison results with AND.

      if (!CondReg) // First.

        CondReg = NewCondReg;

      else { // If not the first, we create an AND.

        Register AndReg = MRI.createVirtualRegister(BoolXExecRC);

        BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)

            .addReg(CondReg)

            .addReg(NewCondReg);

        CondReg = AndReg;

      }


      // Update ScalarOp operand to use the SGPR ScalarOp.

      ScalarOp->setReg(CurReg);

      ScalarOp->setIsKill();

    } else {

      SmallVector<Register, 8> ReadlanePieces;

      unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());

      assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&

             "Unhandled register size");


      for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {

        Register CurRegLo =

            MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

        Register CurRegHi =

            MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);


        // Read the next variant <- also loop target.

        BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)

            .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));


        // Read the next variant <- also loop target.

        BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)

            .addReg(VScalarOp, VScalarOpUndef,

                    TRI->getSubRegFromChannel(Idx + 1));


        ReadlanePieces.push_back(CurRegLo);

        ReadlanePieces.push_back(CurRegHi);


        // Comparison is to be done as 64-bit.

        Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);

        BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)

            .addReg(CurRegLo)

            .addImm(AMDGPU::sub0)

            .addReg(CurRegHi)

            .addImm(AMDGPU::sub1);


        Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);

        auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),

                           NewCondReg)

                       .addReg(CurReg);

        if (NumSubRegs <= 2)

          Cmp.addReg(VScalarOp);

        else

          Cmp.addReg(VScalarOp, VScalarOpUndef,

                     TRI->getSubRegFromChannel(Idx, 2));


        // Combine the comparison results with AND.

        if (!CondReg) // First.

          CondReg = NewCondReg;

        else { // If not the first, we create an AND.

          Register AndReg = MRI.createVirtualRegister(BoolXExecRC);

          BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)

              .addReg(CondReg)

              .addReg(NewCondReg);

          CondReg = AndReg;

        }

      } // End for loop.


      const auto *SScalarOpRC =

          TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));

      Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);


      // Build scalar ScalarOp.

      auto Merge =

          BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);

      unsigned Channel = 0;

      for (Register Piece : ReadlanePieces) {

        Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));

      }


      // Update ScalarOp operand to use the SGPR ScalarOp.

      ScalarOp->setReg(SScalarOp);

      ScalarOp->setIsKill();

    }

  }


  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);

  MRI.setSimpleHint(SaveExec, CondReg);


  // Update EXEC to matching lanes, saving original to SaveExec.

  BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)

      .addReg(CondReg, RegState::Kill);


  // The original instruction is here; we insert the terminators after it.

  I = BodyBB.end();


  // Update EXEC, switch all done bits to 0 and all todo bits to 1.

  BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)

      .addReg(LMC.ExecReg)

      .addReg(SaveExec);


  BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);

}


// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register

// with SGPRs by iterating over all unique values across all lanes.

// Returns the loop basic block that now contains \p MI.

static MachineBasicBlock *


loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,

                               ArrayRef<MachineOperand *> ScalarOps,

                               MachineDominatorTree *MDT,

                               MachineBasicBlock::iterator Begin = nullptr,

                               MachineBasicBlock::iterator End = nullptr) {

  MachineBasicBlock &MBB = *MI.getParent();

  MachineFunction &MF = *MBB.getParent();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (!Begin.isValid())

    Begin = &MI;

  if (!End.isValid()) {

    End = &MI;

    ++End;

  }

  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();


  // Save SCC. Waterfall Loop may overwrite SCC.

  Register SaveSCCReg;


  // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk

  // rather than unlimited scan everywhere

  bool SCCNotDead =

      MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,

                                  std::numeric_limits<unsigned>::max()) !=

      MachineBasicBlock::LQR_Dead;

  if (SCCNotDead) {

    SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)

        .addImm(1)

        .addImm(0);

  }


  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);


  // Save the EXEC mask

  BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);


  // Killed uses in the instruction we are waterfalling around will be

  // incorrect due to the added control-flow.

  MachineBasicBlock::iterator AfterMI = MI;

  ++AfterMI;

  for (auto I = Begin; I != AfterMI; I++) {

    for (auto &MO : I->all_uses())

      MRI.clearKillFlags(MO.getReg());

  }


  // To insert the loop we need to split the block. Move everything after this

  // point to a new block, and insert a new empty block between the two.

  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();

  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();

  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();

  MachineFunction::iterator MBBI(MBB);

  ++MBBI;


  MF.insert(MBBI, LoopBB);

  MF.insert(MBBI, BodyBB);

  MF.insert(MBBI, RemainderBB);


  LoopBB->addSuccessor(BodyBB);

  BodyBB->addSuccessor(LoopBB);

  BodyBB->addSuccessor(RemainderBB);


  // Move Begin to MI to the BodyBB, and the remainder of the block to

  // RemainderBB.

  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);

  RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());

  BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());


  MBB.addSuccessor(LoopBB);


  // Update dominators. We know that MBB immediately dominates LoopBB, that

  // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates

  // RemainderBB. RemainderBB immediately dominates all of the successors

  // transferred to it from MBB that MBB used to properly dominate.

  if (MDT) {

    MDT->addNewBlock(LoopBB, &MBB);

    MDT->addNewBlock(BodyBB, LoopBB);

    MDT->addNewBlock(RemainderBB, BodyBB);

    for (auto &Succ : RemainderBB->successors()) {

      if (MDT->properlyDominates(&MBB, Succ)) {

        MDT->changeImmediateDominator(Succ, RemainderBB);

      }

    }

  }


  emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);


  MachineBasicBlock::iterator First = RemainderBB->begin();

  // Restore SCC

  if (SCCNotDead) {

    BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))

        .addReg(SaveSCCReg, RegState::Kill)

        .addImm(0);

  }


  // Restore the EXEC mask

  BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)

      .addReg(SaveExec);

  return BodyBB;

}


// Extract pointer from Rsrc and return a zero-value Rsrc replacement.

static std::tuple<unsigned, unsigned>


extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {

  MachineBasicBlock &MBB = *MI.getParent();

  MachineFunction &MF = *MBB.getParent();

  MachineRegisterInfo &MRI = MF.getRegInfo();


  // Extract the ptr from the resource descriptor.

  unsigned RsrcPtr =

      TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,

                             AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);


  // Create an empty resource descriptor

  Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);

  Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);

  Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();


  // Zero64 = 0

  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)

      .addImm(0);


  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}

  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)

      .addImm(Lo_32(RsrcDataFormat));


  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}

  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)

      .addImm(Hi_32(RsrcDataFormat));


  // NewSRsrc = {Zero64, SRsrcFormat}

  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)

      .addReg(Zero64)

      .addImm(AMDGPU::sub0_sub1)

      .addReg(SRsrcFormatLo)

      .addImm(AMDGPU::sub2)

      .addReg(SRsrcFormatHi)

      .addImm(AMDGPU::sub3);


  return std::tuple(RsrcPtr, NewSRsrc);

}


MachineBasicBlock *


SIInstrInfo::legalizeOperands(MachineInstr &MI,

                              MachineDominatorTree *MDT) const {

  MachineFunction &MF = *MI.getParent()->getParent();

  MachineRegisterInfo &MRI = MF.getRegInfo();

  MachineBasicBlock *CreatedBB = nullptr;


  // Legalize VOP2

  if (isVOP2(MI) || isVOPC(MI)) {

    legalizeOperandsVOP2(MRI, MI);

    return CreatedBB;

  }


  // Legalize VOP3

  if (isVOP3(MI)) {

    legalizeOperandsVOP3(MRI, MI);

    return CreatedBB;

  }


  // Legalize SMRD

  if (isSMRD(MI)) {

    legalizeOperandsSMRD(MRI, MI);

    return CreatedBB;

  }


  // Legalize FLAT

  if (isFLAT(MI)) {

    legalizeOperandsFLAT(MRI, MI);

    return CreatedBB;

  }


  // Legalize REG_SEQUENCE and PHI

  // The register class of the operands much be the same type as the register

  // class of the output.

  if (MI.getOpcode() == AMDGPU::PHI) {

    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;

    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {

      if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())

        continue;

      const TargetRegisterClass *OpRC =

          MRI.getRegClass(MI.getOperand(i).getReg());

      if (RI.hasVectorRegisters(OpRC)) {

        VRC = OpRC;

      } else {

        SRC = OpRC;

      }

    }


    // If any of the operands are VGPR registers, then they all most be

    // otherwise we will create illegal VGPR->SGPR copies when legalizing

    // them.

    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {

      if (!VRC) {

        assert(SRC);

        if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {

          VRC = &AMDGPU::VReg_1RegClass;

        } else

          VRC = RI.isAGPRClass(getOpRegClass(MI, 0))

                    ? RI.getEquivalentAGPRClass(SRC)

                    : RI.getEquivalentVGPRClass(SRC);

      } else {

        VRC = RI.isAGPRClass(getOpRegClass(MI, 0))

                  ? RI.getEquivalentAGPRClass(VRC)

                  : RI.getEquivalentVGPRClass(VRC);

      }

      RC = VRC;

    } else {

      RC = SRC;

    }


    // Update all the operands so they have the same type.

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {

      MachineOperand &Op = MI.getOperand(I);

      if (!Op.isReg() || !Op.getReg().isVirtual())

        continue;


      // MI is a PHI instruction.

      MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();

      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();


      // Avoid creating no-op copies with the same src and dst reg class.  These

      // confuse some of the machine passes.

      legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());

    }

  }


  // REG_SEQUENCE doesn't really require operand legalization, but if one has a

  // VGPR dest type and SGPR sources, insert copies so all operands are

  // VGPRs. This seems to help operand folding / the register coalescer.

  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {

    MachineBasicBlock *MBB = MI.getParent();

    const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);

    if (RI.hasVGPRs(DstRC)) {

      // Update all the operands so they are VGPR register classes. These may

      // not be the same register class because REG_SEQUENCE supports mixing

      // subregister index types e.g. sub0_sub1 + sub2 + sub3

      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {

        MachineOperand &Op = MI.getOperand(I);

        if (!Op.isReg() || !Op.getReg().isVirtual())

          continue;


        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());

        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);

        if (VRC == OpRC)

          continue;


        legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());

        Op.setIsKill();

      }

    }


    return CreatedBB;

  }


  // Legalize INSERT_SUBREG

  // src0 must have the same register class as dst

  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {

    Register Dst = MI.getOperand(0).getReg();

    Register Src0 = MI.getOperand(1).getReg();

    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);

    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);

    if (DstRC != Src0RC) {

      MachineBasicBlock *MBB = MI.getParent();

      MachineOperand &Op = MI.getOperand(1);

      legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());

    }

    return CreatedBB;

  }


  // Legalize SI_INIT_M0

  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {

    MachineOperand &Src = MI.getOperand(0);

    if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))

      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));

    return CreatedBB;

  }


  // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM

  if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||

      MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||

      MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||

      MI.getOpcode() == AMDGPU::S_WQM_B32 ||

      MI.getOpcode() == AMDGPU::S_WQM_B64 ||

      MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||

      MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {

    MachineOperand &Src = MI.getOperand(1);

    if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))

      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));

    return CreatedBB;

  }


  // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.

  //

  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via

  // scratch memory access. In both cases, the legalization never involves

  // conversion to the addr64 form.

  if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&

                      (isMUBUF(MI) || isMTBUF(MI)))) {

    AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))

                                    ? AMDGPU::OpName::rsrc

                                    : AMDGPU::OpName::srsrc;

    MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);

    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))

      CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);


    AMDGPU::OpName SampOpName =

        isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;

    MachineOperand *SSamp = getNamedOperand(MI, SampOpName);

    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))

      CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);


    return CreatedBB;

  }


  // Legalize SI_CALL

  if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {

    MachineOperand *Dest = &MI.getOperand(0);

    if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {

      // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and

      // following copies, we also need to move copies from and to physical

      // registers into the loop block.

      unsigned FrameSetupOpcode = getCallFrameSetupOpcode();

      unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();


      // Also move the copies to physical registers into the loop block

      MachineBasicBlock &MBB = *MI.getParent();

      MachineBasicBlock::iterator Start(&MI);

      while (Start->getOpcode() != FrameSetupOpcode)

        --Start;

      MachineBasicBlock::iterator End(&MI);

      while (End->getOpcode() != FrameDestroyOpcode)

        ++End;

      // Also include following copies of the return value

      ++End;

      while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&

             MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))

        ++End;

      CreatedBB =

          loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);

    }

  }


  // Legalize s_sleep_var.

  if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {

    const DebugLoc &DL = MI.getDebugLoc();

    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

    int Src0Idx =

        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

    MachineOperand &Src0 = MI.getOperand(Src0Idx);

    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)

        .add(Src0);

    Src0.ChangeToRegister(Reg, false);

    return nullptr;

  }


  // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,

  // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.

  if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||

      MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||

      MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||

      MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {

    for (MachineOperand &Src : MI.explicit_operands()) {

      if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))

        Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));

    }

    return CreatedBB;

  }


  // Legalize MUBUF instructions.

  bool isSoffsetLegal = true;

  int SoffsetIdx =

      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);

  if (SoffsetIdx != -1) {

    MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);

    if (Soffset->isReg() && Soffset->getReg().isVirtual() &&

        !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {

      isSoffsetLegal = false;

    }

  }


  bool isRsrcLegal = true;

  int RsrcIdx =

      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);

  if (RsrcIdx != -1) {

    MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);

    if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))

      isRsrcLegal = false;

  }


  // The operands are legal.

  if (isRsrcLegal && isSoffsetLegal)

    return CreatedBB;


  if (!isRsrcLegal) {

    // Legalize a VGPR Rsrc

    //

    // If the instruction is _ADDR64, we can avoid a waterfall by extracting

    // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using

    // a zero-value SRsrc.

    //

    // If the instruction is _OFFSET (both idxen and offen disabled), and we

    // support ADDR64 instructions, we can convert to ADDR64 and do the same as

    // above.

    //

    // Otherwise we are on non-ADDR64 hardware, and/or we have

    // idxen/offen/bothen and we fall back to a waterfall loop.


    MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);

    MachineBasicBlock &MBB = *MI.getParent();


    MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);

    if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {

      // This is already an ADDR64 instruction so we need to add the pointer

      // extracted from the resource descriptor to the current value of VAddr.

      Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);


      const auto *BoolXExecRC = RI.getWaveMaskRegClass();

      Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);

      Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);


      unsigned RsrcPtr, NewSRsrc;

      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);


      // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0

      const DebugLoc &DL = MI.getDebugLoc();

      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)

        .addDef(CondReg0)

        .addReg(RsrcPtr, 0, AMDGPU::sub0)

        .addReg(VAddr->getReg(), 0, AMDGPU::sub0)

        .addImm(0);


      // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1

      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)

        .addDef(CondReg1, RegState::Dead)

        .addReg(RsrcPtr, 0, AMDGPU::sub1)

        .addReg(VAddr->getReg(), 0, AMDGPU::sub1)

        .addReg(CondReg0, RegState::Kill)

        .addImm(0);


      // NewVaddr = {NewVaddrHi, NewVaddrLo}

      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)

          .addReg(NewVAddrLo)

          .addImm(AMDGPU::sub0)

          .addReg(NewVAddrHi)

          .addImm(AMDGPU::sub1);


      VAddr->setReg(NewVAddr);

      Rsrc->setReg(NewSRsrc);

    } else if (!VAddr && ST.hasAddr64()) {

      // This instructions is the _OFFSET variant, so we need to convert it to

      // ADDR64.

      assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&

             "FIXME: Need to emit flat atomics here");


      unsigned RsrcPtr, NewSRsrc;

      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);


      Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);

      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);

      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);

      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());


      // Atomics with return have an additional tied operand and are

      // missing some of the special bits.

      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);

      MachineInstr *Addr64;


      if (!VDataIn) {

        // Regular buffer load / store.

        MachineInstrBuilder MIB =

            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))

                .add(*VData)

                .addReg(NewVAddr)

                .addReg(NewSRsrc)

                .add(*SOffset)

                .add(*Offset);


        if (const MachineOperand *CPol =

                getNamedOperand(MI, AMDGPU::OpName::cpol)) {

          MIB.addImm(CPol->getImm());

        }


        if (const MachineOperand *TFE =

                getNamedOperand(MI, AMDGPU::OpName::tfe)) {

          MIB.addImm(TFE->getImm());

        }


        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));


        MIB.cloneMemRefs(MI);

        Addr64 = MIB;

      } else {

        // Atomics with return.

        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))

                     .add(*VData)

                     .add(*VDataIn)

                     .addReg(NewVAddr)

                     .addReg(NewSRsrc)

                     .add(*SOffset)

                     .add(*Offset)

                     .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))

                     .cloneMemRefs(MI);

      }


      MI.removeFromParent();


      // NewVaddr = {NewVaddrHi, NewVaddrLo}

      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),

              NewVAddr)

          .addReg(RsrcPtr, 0, AMDGPU::sub0)

          .addImm(AMDGPU::sub0)

          .addReg(RsrcPtr, 0, AMDGPU::sub1)

          .addImm(AMDGPU::sub1);

    } else {

      // Legalize a VGPR Rsrc and soffset together.

      if (!isSoffsetLegal) {

        MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);

        CreatedBB =

            loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);

        return CreatedBB;

      }

      CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);

      return CreatedBB;

    }

  }


  // Legalize a VGPR soffset.

  if (!isSoffsetLegal) {

    MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);

    CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);

    return CreatedBB;

  }

  return CreatedBB;

}


void SIInstrWorklist::insert(MachineInstr *MI) {

  InstrList.insert(MI);

  // Add MBUF instructiosn to deferred list.

  int RsrcIdx =

      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);

  if (RsrcIdx != -1) {

    DeferredList.insert(MI);

  }

}


bool SIInstrWorklist::isDeferred(MachineInstr *MI) {

  return DeferredList.contains(MI);

}


// Legalize size mismatches between 16bit and 32bit registers in v2s copy

// lowering (change spgr to vgpr).

// This is mainly caused by 16bit SALU and 16bit VALU using reg with different

// size. Need to legalize the size of the operands during the vgpr lowering

// chain. This can be removed after we have sgpr16 in place


void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,

                                          MachineRegisterInfo &MRI) const {

  if (!ST.useRealTrue16Insts())

    return;


  unsigned Opcode = MI.getOpcode();

  MachineBasicBlock *MBB = MI.getParent();

  // Legalize operands and check for size mismatch

  if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||

      OpIdx >= get(Opcode).getNumOperands() ||

      get(Opcode).operands()[OpIdx].RegClass == -1)

    return;


  MachineOperand &Op = MI.getOperand(OpIdx);

  if (!Op.isReg() || !Op.getReg().isVirtual())

    return;


  const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());

  if (!RI.isVGPRClass(CurrRC))

    return;


  unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;

  const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);

  if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {

    Op.setSubReg(AMDGPU::lo16);

  } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {

    const DebugLoc &DL = MI.getDebugLoc();

    Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);

    BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);

    BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)

        .addReg(Op.getReg())

        .addImm(AMDGPU::lo16)

        .addReg(Undef)

        .addImm(AMDGPU::hi16);

    Op.setReg(NewDstReg);

  }

}


void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,

                                          MachineRegisterInfo &MRI) const {

  for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)

    legalizeOperandsVALUt16(MI, OpIdx, MRI);

}


void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,

                             MachineDominatorTree *MDT) const {


  while (!Worklist.empty()) {

    MachineInstr &Inst = *Worklist.top();

    Worklist.erase_top();

    // Skip MachineInstr in the deferred list.

    if (Worklist.isDeferred(&Inst))

      continue;

    moveToVALUImpl(Worklist, MDT, Inst);

  }


  // Deferred list of instructions will be processed once

  // all the MachineInstr in the worklist are done.

  for (MachineInstr *Inst : Worklist.getDeferredList()) {

    moveToVALUImpl(Worklist, MDT, *Inst);

    assert(Worklist.empty() &&

           "Deferred MachineInstr are not supposed to re-populate worklist");

  }

}


void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,

                                 MachineDominatorTree *MDT,

                                 MachineInstr &Inst) const {


  MachineBasicBlock *MBB = Inst.getParent();

  if (!MBB)

    return;

  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

  unsigned Opcode = Inst.getOpcode();

  unsigned NewOpcode = getVALUOp(Inst);

  // Handle some special cases

  switch (Opcode) {

  default:

    break;

  case AMDGPU::S_ADD_I32:

  case AMDGPU::S_SUB_I32: {

    // FIXME: The u32 versions currently selected use the carry.

    bool Changed;

    MachineBasicBlock *CreatedBBTmp = nullptr;

    std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);

    if (Changed)

      return;


    // Default handling

    break;

  }


  case AMDGPU::S_MUL_U64:

    if (ST.hasVectorMulU64()) {

      NewOpcode = AMDGPU::V_MUL_U64_e64;

      break;

    }

    // Split s_mul_u64 in 32-bit vector multiplications.

    splitScalarSMulU64(Worklist, Inst, MDT);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_MUL_U64_U32_PSEUDO:

  case AMDGPU::S_MUL_I64_I32_PSEUDO:

    // This is a special case of s_mul_u64 where all the operands are either

    // zero extended or sign extended.

    splitScalarSMulPseudo(Worklist, Inst, MDT);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_AND_B64:

    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_OR_B64:

    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_XOR_B64:

    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_NAND_B64:

    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_NOR_B64:

    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_XNOR_B64:

    if (ST.hasDLInsts())

      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);

    else

      splitScalar64BitXnor(Worklist, Inst, MDT);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_ANDN2_B64:

    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_ORN2_B64:

    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_BREV_B64:

    splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_NOT_B64:

    splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_BCNT1_I32_B64:

    splitScalar64BitBCNT(Worklist, Inst);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_BFE_I64:

    splitScalar64BitBFE(Worklist, Inst);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_FLBIT_I32_B64:

    splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);

    Inst.eraseFromParent();

    return;

  case AMDGPU::S_FF1_I32_B64:

    splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_LSHL_B32:

    if (ST.hasOnlyRevVALUShifts()) {

      NewOpcode = AMDGPU::V_LSHLREV_B32_e64;

      swapOperands(Inst);

    }

    break;

  case AMDGPU::S_ASHR_I32:

    if (ST.hasOnlyRevVALUShifts()) {

      NewOpcode = AMDGPU::V_ASHRREV_I32_e64;

      swapOperands(Inst);

    }

    break;

  case AMDGPU::S_LSHR_B32:

    if (ST.hasOnlyRevVALUShifts()) {

      NewOpcode = AMDGPU::V_LSHRREV_B32_e64;

      swapOperands(Inst);

    }

    break;

  case AMDGPU::S_LSHL_B64:

    if (ST.hasOnlyRevVALUShifts()) {

      NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12

                      ? AMDGPU::V_LSHLREV_B64_pseudo_e64

                      : AMDGPU::V_LSHLREV_B64_e64;

      swapOperands(Inst);

    }

    break;

  case AMDGPU::S_ASHR_I64:

    if (ST.hasOnlyRevVALUShifts()) {

      NewOpcode = AMDGPU::V_ASHRREV_I64_e64;

      swapOperands(Inst);

    }

    break;

  case AMDGPU::S_LSHR_B64:

    if (ST.hasOnlyRevVALUShifts()) {

      NewOpcode = AMDGPU::V_LSHRREV_B64_e64;

      swapOperands(Inst);

    }

    break;


  case AMDGPU::S_ABS_I32:

    lowerScalarAbs(Worklist, Inst);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_CBRANCH_SCC0:

  case AMDGPU::S_CBRANCH_SCC1: {

    // Clear unused bits of vcc

    Register CondReg = Inst.getOperand(1).getReg();

    bool IsSCC = CondReg == AMDGPU::SCC;

    const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

    BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)

        .addReg(LMC.ExecReg)

        .addReg(IsSCC ? LMC.VccReg : CondReg);

    Inst.removeOperand(1);

  } break;


  case AMDGPU::S_BFE_U64:

  case AMDGPU::S_BFM_B64:

    llvm_unreachable("Moving this op to VALU not implemented");


  case AMDGPU::S_PACK_LL_B32_B16:

  case AMDGPU::S_PACK_LH_B32_B16:

  case AMDGPU::S_PACK_HL_B32_B16:

  case AMDGPU::S_PACK_HH_B32_B16:

    movePackToVALU(Worklist, MRI, Inst);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_XNOR_B32:

    lowerScalarXnor(Worklist, Inst);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_NAND_B32:

    splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_NOR_B32:

    splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_ANDN2_B32:

    splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);

    Inst.eraseFromParent();

    return;


  case AMDGPU::S_ORN2_B32:

    splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);

    Inst.eraseFromParent();

    return;


  // TODO: remove as soon as everything is ready

  // to replace VGPR to SGPR copy with V_READFIRSTLANEs.

  // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO

  // can only be selected from the uniform SDNode.

  case AMDGPU::S_ADD_CO_PSEUDO:

  case AMDGPU::S_SUB_CO_PSEUDO: {

    unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)

                       ? AMDGPU::V_ADDC_U32_e64

                       : AMDGPU::V_SUBB_U32_e64;

    const auto *CarryRC = RI.getWaveMaskRegClass();


    Register CarryInReg = Inst.getOperand(4).getReg();

    if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {

      Register NewCarryReg = MRI.createVirtualRegister(CarryRC);

      BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)

          .addReg(CarryInReg);

    }


    Register CarryOutReg = Inst.getOperand(1).getReg();


    Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(

        MRI.getRegClass(Inst.getOperand(0).getReg())));

    MachineInstr *CarryOp =

        BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)

            .addReg(CarryOutReg, RegState::Define)

            .add(Inst.getOperand(2))

            .add(Inst.getOperand(3))

            .addReg(CarryInReg)

            .addImm(0);

    legalizeOperands(*CarryOp);

    MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);

    addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);

    Inst.eraseFromParent();

  }

    return;

  case AMDGPU::S_UADDO_PSEUDO:

  case AMDGPU::S_USUBO_PSEUDO: {

    const DebugLoc &DL = Inst.getDebugLoc();

    MachineOperand &Dest0 = Inst.getOperand(0);

    MachineOperand &Dest1 = Inst.getOperand(1);

    MachineOperand &Src0 = Inst.getOperand(2);

    MachineOperand &Src1 = Inst.getOperand(3);


    unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)

                       ? AMDGPU::V_ADD_CO_U32_e64

                       : AMDGPU::V_SUB_CO_U32_e64;

    const TargetRegisterClass *NewRC =

        RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));

    Register DestReg = MRI.createVirtualRegister(NewRC);

    MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)

                                 .addReg(Dest1.getReg(), RegState::Define)

                                 .add(Src0)

                                 .add(Src1)

                                 .addImm(0); // clamp bit


    legalizeOperands(*NewInstr, MDT);

    MRI.replaceRegWith(Dest0.getReg(), DestReg);

    addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,

                                 Worklist);

    Inst.eraseFromParent();

  }

    return;


  case AMDGPU::S_CSELECT_B32:

  case AMDGPU::S_CSELECT_B64:

    lowerSelect(Worklist, Inst, MDT);

    Inst.eraseFromParent();

    return;

  case AMDGPU::S_CMP_EQ_I32:

  case AMDGPU::S_CMP_LG_I32:

  case AMDGPU::S_CMP_GT_I32:

  case AMDGPU::S_CMP_GE_I32:

  case AMDGPU::S_CMP_LT_I32:

  case AMDGPU::S_CMP_LE_I32:

  case AMDGPU::S_CMP_EQ_U32:

  case AMDGPU::S_CMP_LG_U32:

  case AMDGPU::S_CMP_GT_U32:

  case AMDGPU::S_CMP_GE_U32:

  case AMDGPU::S_CMP_LT_U32:

  case AMDGPU::S_CMP_LE_U32:

  case AMDGPU::S_CMP_EQ_U64:

  case AMDGPU::S_CMP_LG_U64:

  case AMDGPU::S_CMP_LT_F32:

  case AMDGPU::S_CMP_EQ_F32:

  case AMDGPU::S_CMP_LE_F32:

  case AMDGPU::S_CMP_GT_F32:

  case AMDGPU::S_CMP_LG_F32:

  case AMDGPU::S_CMP_GE_F32:

  case AMDGPU::S_CMP_O_F32:

  case AMDGPU::S_CMP_U_F32:

  case AMDGPU::S_CMP_NGE_F32:

  case AMDGPU::S_CMP_NLG_F32:

  case AMDGPU::S_CMP_NGT_F32:

  case AMDGPU::S_CMP_NLE_F32:

  case AMDGPU::S_CMP_NEQ_F32:

  case AMDGPU::S_CMP_NLT_F32: {

    Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());

    auto NewInstr =

        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)

            .setMIFlags(Inst.getFlags());

    if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=

        0) {

      NewInstr

          .addImm(0)               // src0_modifiers

          .add(Inst.getOperand(0)) // src0

          .addImm(0)               // src1_modifiers

          .add(Inst.getOperand(1)) // src1

          .addImm(0);              // clamp

    } else {

      NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));

    }

    legalizeOperands(*NewInstr, MDT);

    int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);

    MachineOperand SCCOp = Inst.getOperand(SCCIdx);

    addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);

    Inst.eraseFromParent();

    return;

  }

  case AMDGPU::S_CMP_LT_F16:

  case AMDGPU::S_CMP_EQ_F16:

  case AMDGPU::S_CMP_LE_F16:

  case AMDGPU::S_CMP_GT_F16:

  case AMDGPU::S_CMP_LG_F16:

  case AMDGPU::S_CMP_GE_F16:

  case AMDGPU::S_CMP_O_F16:

  case AMDGPU::S_CMP_U_F16:

  case AMDGPU::S_CMP_NGE_F16:

  case AMDGPU::S_CMP_NLG_F16:

  case AMDGPU::S_CMP_NGT_F16:

  case AMDGPU::S_CMP_NLE_F16:

  case AMDGPU::S_CMP_NEQ_F16:

  case AMDGPU::S_CMP_NLT_F16: {

    Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());

    auto NewInstr =

        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)

        .setMIFlags(Inst.getFlags());

    if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {

      NewInstr

          .addImm(0)               // src0_modifiers

          .add(Inst.getOperand(0)) // src0

          .addImm(0)               // src1_modifiers

          .add(Inst.getOperand(1)) // src1

          .addImm(0);              // clamp

      if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))

        NewInstr.addImm(0); // op_sel0

    } else {

      NewInstr

          .add(Inst.getOperand(0))

          .add(Inst.getOperand(1));

    }

    legalizeOperandsVALUt16(*NewInstr, MRI);

    legalizeOperands(*NewInstr, MDT);

    int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);

    MachineOperand SCCOp = Inst.getOperand(SCCIdx);

    addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);

    Inst.eraseFromParent();

    return;

  }

  case AMDGPU::S_CVT_HI_F32_F16: {

    const DebugLoc &DL = Inst.getDebugLoc();

    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    if (ST.useRealTrue16Insts()) {

      BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)

          .add(Inst.getOperand(1));

      BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)

          .addImm(0) // src0_modifiers

          .addReg(TmpReg, 0, AMDGPU::hi16)

          .addImm(0)  // clamp

          .addImm(0)  // omod

          .addImm(0); // op_sel0

    } else {

      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)

          .addImm(16)

          .add(Inst.getOperand(1));

      BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)

          .addImm(0) // src0_modifiers

          .addReg(TmpReg)

          .addImm(0)  // clamp

          .addImm(0); // omod

    }


    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);

    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);

    Inst.eraseFromParent();

    return;

  }

  case AMDGPU::S_MINIMUM_F32:

  case AMDGPU::S_MAXIMUM_F32: {

    const DebugLoc &DL = Inst.getDebugLoc();

    Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)

                                 .addImm(0) // src0_modifiers

                                 .add(Inst.getOperand(1))

                                 .addImm(0) // src1_modifiers

                                 .add(Inst.getOperand(2))

                                 .addImm(0)  // clamp

                                 .addImm(0); // omod

    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);


    legalizeOperands(*NewInstr, MDT);

    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);

    Inst.eraseFromParent();

    return;

  }

  case AMDGPU::S_MINIMUM_F16:

  case AMDGPU::S_MAXIMUM_F16: {

    const DebugLoc &DL = Inst.getDebugLoc();

    Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()

                                                    ? &AMDGPU::VGPR_16RegClass

                                                    : &AMDGPU::VGPR_32RegClass);

    MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)

                                 .addImm(0) // src0_modifiers

                                 .add(Inst.getOperand(1))

                                 .addImm(0) // src1_modifiers

                                 .add(Inst.getOperand(2))

                                 .addImm(0)  // clamp

                                 .addImm(0)  // omod

                                 .addImm(0); // opsel0

    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);

    legalizeOperandsVALUt16(*NewInstr, MRI);

    legalizeOperands(*NewInstr, MDT);

    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);

    Inst.eraseFromParent();

    return;

  }

  case AMDGPU::V_S_EXP_F16_e64:

  case AMDGPU::V_S_LOG_F16_e64:

  case AMDGPU::V_S_RCP_F16_e64:

  case AMDGPU::V_S_RSQ_F16_e64:

  case AMDGPU::V_S_SQRT_F16_e64: {

    const DebugLoc &DL = Inst.getDebugLoc();

    Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()

                                                    ? &AMDGPU::VGPR_16RegClass

                                                    : &AMDGPU::VGPR_32RegClass);

    auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)

                        .add(Inst.getOperand(1)) // src0_modifiers

                        .add(Inst.getOperand(2))

                        .add(Inst.getOperand(3)) // clamp

                        .add(Inst.getOperand(4)) // omod

                        .setMIFlags(Inst.getFlags());

    if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))

      NewInstr.addImm(0); // opsel0

    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);

    legalizeOperandsVALUt16(*NewInstr, MRI);

    legalizeOperands(*NewInstr, MDT);

    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);

    Inst.eraseFromParent();

    return;

  }

  }


  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {

    // We cannot move this instruction to the VALU, so we should try to

    // legalize its operands instead.

    legalizeOperands(Inst, MDT);

    return;

  }

  // Handle converting generic instructions like COPY-to-SGPR into

  // COPY-to-VGPR.

  if (NewOpcode == Opcode) {

    Register DstReg = Inst.getOperand(0).getReg();

    const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);


    // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and

    // hope for the best.

    if (Inst.isCopy() && DstReg.isPhysical() &&

        RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {

      // TODO: Only works for 32 bit registers.

      if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {

        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),

                get(AMDGPU::V_READFIRSTLANE_B32), DstReg)

            .add(Inst.getOperand(1));

      } else {

        Register NewDst =

            MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),

                get(AMDGPU::V_READFIRSTLANE_B32), NewDst)

            .add(Inst.getOperand(1));

        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),

                DstReg)

            .addReg(NewDst);

      }

      Inst.eraseFromParent();

      return;

    }


    if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&

        NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {

      // Instead of creating a copy where src and dst are the same register

      // class, we just replace all uses of dst with src.  These kinds of

      // copies interfere with the heuristics MachineSink uses to decide

      // whether or not to split a critical edge.  Since the pass assumes

      // that copies will end up as machine instructions and not be

      // eliminated.

      addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);

      Register NewDstReg = Inst.getOperand(1).getReg();

      MRI.replaceRegWith(DstReg, NewDstReg);

      MRI.clearKillFlags(NewDstReg);

      Inst.getOperand(0).setReg(DstReg);

      Inst.eraseFromParent();

      // Legalize t16 operand since replaceReg is called after addUsersToVALU

      for (MachineOperand &MO :

           make_early_inc_range(MRI.use_operands(NewDstReg))) {

        legalizeOperandsVALUt16(*MO.getParent(), MRI);

      }

      return;

    }


    // If this is a v2s copy between 16bit and 32bit reg,

    // replace vgpr copy to reg_sequence/extract_subreg

    // This can be remove after we have sgpr16 in place

    if (ST.useRealTrue16Insts() && Inst.isCopy() &&

        Inst.getOperand(1).getReg().isVirtual() &&

        RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {

      const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);

      if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {

        Register NewDstReg = MRI.createVirtualRegister(NewDstRC);

        Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);

        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),

                get(AMDGPU::IMPLICIT_DEF), Undef);

        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),

                get(AMDGPU::REG_SEQUENCE), NewDstReg)

            .addReg(Inst.getOperand(1).getReg())

            .addImm(AMDGPU::lo16)

            .addReg(Undef)

            .addImm(AMDGPU::hi16);

        Inst.eraseFromParent();

        MRI.replaceRegWith(DstReg, NewDstReg);

        addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);

        return;

      } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,

                                             AMDGPU::lo16)) {

        Inst.getOperand(1).setSubReg(AMDGPU::lo16);

        Register NewDstReg = MRI.createVirtualRegister(NewDstRC);

        MRI.replaceRegWith(DstReg, NewDstReg);

        addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);

        return;

      }

    }


    Register NewDstReg = MRI.createVirtualRegister(NewDstRC);

    MRI.replaceRegWith(DstReg, NewDstReg);

    legalizeOperands(Inst, MDT);

    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);

    return;

  }


  // Use the new VALU Opcode.

  auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))

                      .setMIFlags(Inst.getFlags());

  if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {

    // Intersperse VOP3 modifiers among the SALU operands.

    NewInstr->addOperand(Inst.getOperand(0));

    if (AMDGPU::getNamedOperandIdx(NewOpcode,

                                   AMDGPU::OpName::src0_modifiers) >= 0)

      NewInstr.addImm(0);

    if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {

      MachineOperand Src = Inst.getOperand(1);

      NewInstr->addOperand(Src);

    }


    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {

      // We are converting these to a BFE, so we need to add the missing

      // operands for the size and offset.

      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;

      NewInstr.addImm(0);

      NewInstr.addImm(Size);

    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {

      // The VALU version adds the second operand to the result, so insert an

      // extra 0 operand.

      NewInstr.addImm(0);

    } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {

      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);

      // If we need to move this to VGPRs, we need to unpack the second

      // operand back into the 2 separate ones for bit offset and width.

      assert(OffsetWidthOp.isImm() &&

             "Scalar BFE is only implemented for constant width and offset");

      uint32_t Imm = OffsetWidthOp.getImm();


      uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].

      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

      NewInstr.addImm(Offset);

      NewInstr.addImm(BitWidth);

    } else {

      if (AMDGPU::getNamedOperandIdx(NewOpcode,

                                     AMDGPU::OpName::src1_modifiers) >= 0)

        NewInstr.addImm(0);

      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)

        NewInstr->addOperand(Inst.getOperand(2));

      if (AMDGPU::getNamedOperandIdx(NewOpcode,

                                     AMDGPU::OpName::src2_modifiers) >= 0)

        NewInstr.addImm(0);

      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)

        NewInstr->addOperand(Inst.getOperand(3));

      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)

        NewInstr.addImm(0);

      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)

        NewInstr.addImm(0);

      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)

        NewInstr.addImm(0);

    }

  } else {

    // Just copy the SALU operands.

    for (const MachineOperand &Op : Inst.explicit_operands())

      NewInstr->addOperand(Op);

  }


  // Remove any references to SCC. Vector instructions can't read from it, and

  // We're just about to add the implicit use / defs of VCC, and we don't want

  // both.

  for (MachineOperand &Op : Inst.implicit_operands()) {

    if (Op.getReg() == AMDGPU::SCC) {

      // Only propagate through live-def of SCC.

      if (Op.isDef() && !Op.isDead())

        addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);

      if (Op.isUse())

        addSCCDefsToVALUWorklist(NewInstr, Worklist);

    }

  }

  Inst.eraseFromParent();

  Register NewDstReg;

  if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {

    Register DstReg = NewInstr->getOperand(0).getReg();

    assert(DstReg.isVirtual());

    // Update the destination register class.

    const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);

    assert(NewDstRC);

    NewDstReg = MRI.createVirtualRegister(NewDstRC);

    MRI.replaceRegWith(DstReg, NewDstReg);

  }

  fixImplicitOperands(*NewInstr);


  legalizeOperandsVALUt16(*NewInstr, MRI);


  // Legalize the operands

  legalizeOperands(*NewInstr, MDT);

  if (NewDstReg)

    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);

}


// Add/sub require special handling to deal with carry outs.

std::pair<bool, MachineBasicBlock *>

SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,

                              MachineDominatorTree *MDT) const {

  if (ST.hasAddNoCarry()) {

    // Assume there is no user of scc since we don't select this in that case.

    // Since scc isn't used, it doesn't really matter if the i32 or u32 variant

    // is used.


    MachineBasicBlock &MBB = *Inst.getParent();

    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();


    Register OldDstReg = Inst.getOperand(0).getReg();

    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);


    unsigned Opc = Inst.getOpcode();

    assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);


    unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?

      AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;


    assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);

    Inst.removeOperand(3);


    Inst.setDesc(get(NewOpc));

    Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit

    Inst.addImplicitDefUseOperands(*MBB.getParent());

    MRI.replaceRegWith(OldDstReg, ResultReg);

    MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);


    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);

    return std::pair(true, NewBB);

  }


  return std::pair(false, nullptr);

}


void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,

                              MachineDominatorTree *MDT) const {


  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;

  DebugLoc DL = Inst.getDebugLoc();


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src0 = Inst.getOperand(1);

  MachineOperand &Src1 = Inst.getOperand(2);

  MachineOperand &Cond = Inst.getOperand(3);


  Register CondReg = Cond.getReg();

  bool IsSCC = (CondReg == AMDGPU::SCC);


  // If this is a trivial select where the condition is effectively not SCC

  // (CondReg is a source of copy to SCC), then the select is semantically

  // equivalent to copying CondReg. Hence, there is no need to create

  // V_CNDMASK, we can just use that and bail out.

  if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&

      (Src1.getImm() == 0)) {

    MRI.replaceRegWith(Dest.getReg(), CondReg);

    return;

  }


  Register NewCondReg = CondReg;

  if (IsSCC) {

    const TargetRegisterClass *TC = RI.getWaveMaskRegClass();

    NewCondReg = MRI.createVirtualRegister(TC);


    // Now look for the closest SCC def if it is a copy

    // replacing the CondReg with the COPY source register

    bool CopyFound = false;

    for (MachineInstr &CandI :

         make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),

                    Inst.getParent()->rend())) {

      if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=

          -1) {

        if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {

          BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)

              .addReg(CandI.getOperand(1).getReg());

          CopyFound = true;

        }

        break;

      }

    }

    if (!CopyFound) {

      // SCC def is not a copy

      // Insert a trivial select instead of creating a copy, because a copy from

      // SCC would semantically mean just copying a single bit, but we may need

      // the result to be a vector condition mask that needs preserving.

      unsigned Opcode =

          ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

      auto NewSelect =

          BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);

      NewSelect->getOperand(3).setIsUndef(Cond.isUndef());

    }

  }


  Register NewDestReg = MRI.createVirtualRegister(

      RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));

  MachineInstr *NewInst;

  if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {

    NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)

                  .addImm(0)

                  .add(Src1) // False

                  .addImm(0)

                  .add(Src0) // True

                  .addReg(NewCondReg);

  } else {

    NewInst =

        BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)

            .add(Src1) // False

            .add(Src0) // True

            .addReg(NewCondReg);

  }

  MRI.replaceRegWith(Dest.getReg(), NewDestReg);

  legalizeOperands(*NewInst, MDT);

  addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);

}


void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,

                                 MachineInstr &Inst) const {

  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;

  DebugLoc DL = Inst.getDebugLoc();


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src = Inst.getOperand(1);

  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);


  unsigned SubOp = ST.hasAddNoCarry() ?

    AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;


  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)

    .addImm(0)

    .addReg(Src.getReg());


  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)

    .addReg(Src.getReg())

    .addReg(TmpReg);


  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);

}


void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,

                                  MachineInstr &Inst) const {

  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;

  const DebugLoc &DL = Inst.getDebugLoc();


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src0 = Inst.getOperand(1);

  MachineOperand &Src1 = Inst.getOperand(2);


  if (ST.hasDLInsts()) {

    Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);

    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);


    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)

      .add(Src0)

      .add(Src1);


    MRI.replaceRegWith(Dest.getReg(), NewDest);

    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);

  } else {

    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can

    // invert either source and then perform the XOR. If either source is a

    // scalar register, then we can leave the inversion on the scalar unit to

    // achieve a better distribution of scalar and vector instructions.

    bool Src0IsSGPR = Src0.isReg() &&

                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));

    bool Src1IsSGPR = Src1.isReg() &&

                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));

    MachineInstr *Xor;

    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);


    // Build a pair of scalar instructions and add them to the work list.

    // The next iteration over the work list will lower these to the vector

    // unit as necessary.

    if (Src0IsSGPR) {

      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);

      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)

      .addReg(Temp)

      .add(Src1);

    } else if (Src1IsSGPR) {

      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);

      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)

      .add(Src0)

      .addReg(Temp);

    } else {

      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)

        .add(Src0)

        .add(Src1);

      MachineInstr *Not =

          BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);

      Worklist.insert(Not);

    }


    MRI.replaceRegWith(Dest.getReg(), NewDest);


    Worklist.insert(Xor);


    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);

  }

}


void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,

                                      MachineInstr &Inst,

                                      unsigned Opcode) const {

  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;

  const DebugLoc &DL = Inst.getDebugLoc();


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src0 = Inst.getOperand(1);

  MachineOperand &Src1 = Inst.getOperand(2);


  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);


  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)

    .add(Src0)

    .add(Src1);


  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)

    .addReg(Interm);


  Worklist.insert(&Op);

  Worklist.insert(&Not);


  MRI.replaceRegWith(Dest.getReg(), NewDest);

  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);

}


void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,

                                     MachineInstr &Inst,

                                     unsigned Opcode) const {

  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;

  const DebugLoc &DL = Inst.getDebugLoc();


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src0 = Inst.getOperand(1);

  MachineOperand &Src1 = Inst.getOperand(2);


  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);


  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)

    .add(Src1);


  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)

    .add(Src0)

    .addReg(Interm);


  Worklist.insert(&Not);

  Worklist.insert(&Op);


  MRI.replaceRegWith(Dest.getReg(), NewDest);

  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);

}


void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,

                                          MachineInstr &Inst, unsigned Opcode,

                                          bool Swap) const {

  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src0 = Inst.getOperand(1);

  DebugLoc DL = Inst.getDebugLoc();


  MachineBasicBlock::iterator MII = Inst;


  const MCInstrDesc &InstDesc = get(Opcode);

  const TargetRegisterClass *Src0RC = Src0.isReg() ?

    MRI.getRegClass(Src0.getReg()) :

    &AMDGPU::SGPR_32RegClass;


  const TargetRegisterClass *Src0SubRC =

      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);


  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,

                                                       AMDGPU::sub0, Src0SubRC);


  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());

  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);

  const TargetRegisterClass *NewDestSubRC =

      RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);


  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);

  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);


  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,

                                                       AMDGPU::sub1, Src0SubRC);


  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);

  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);


  if (Swap)

    std::swap(DestSub0, DestSub1);


  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)

    .addReg(DestSub0)

    .addImm(AMDGPU::sub0)

    .addReg(DestSub1)

    .addImm(AMDGPU::sub1);


  MRI.replaceRegWith(Dest.getReg(), FullDestReg);


  Worklist.insert(&LoHalf);

  Worklist.insert(&HiHalf);


  // We don't need to legalizeOperands here because for a single operand, src0

  // will support any kind of input.


  // Move all users of this moved value.

  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);

}


// There is not a vector equivalent of s_mul_u64. For this reason, we need to

// split the s_mul_u64 in 32-bit vector multiplications.

void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,

                                     MachineInstr &Inst,

                                     MachineDominatorTree *MDT) const {

  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();


  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src0 = Inst.getOperand(1);

  MachineOperand &Src1 = Inst.getOperand(2);

  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;


  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());

  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());

  const TargetRegisterClass *Src0SubRC =

      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);

  if (RI.isSGPRClass(Src0SubRC))

    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);

  const TargetRegisterClass *Src1SubRC =

      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);

  if (RI.isSGPRClass(Src1SubRC))

    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);


  // First, we extract the low 32-bit and high 32-bit values from each of the

  // operands.

  MachineOperand Op0L =

      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);

  MachineOperand Op1L =

      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

  MachineOperand Op0H =

      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);

  MachineOperand Op1H =

      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);


  // The multilication is done as follows:

  //

  //                            Op1H  Op1L

  //                          * Op0H  Op0L

  //                       --------------------

  //                       Op1H*Op0L  Op1L*Op0L

  //          + Op1H*Op0H  Op1L*Op0H

  // -----------------------------------------

  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L

  //

  //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit

  //  value and that would overflow.

  //  The low 32-bit value is Op1L*Op0L.

  //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).


  Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineInstr *Op1L_Op0H =

      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)

          .add(Op1L)

          .add(Op0H);


  Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineInstr *Op1H_Op0L =

      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)

          .add(Op1H)

          .add(Op0L);


  Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineInstr *Carry =

      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)

          .add(Op1L)

          .add(Op0L);


  MachineInstr *LoHalf =

      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)

          .add(Op1L)

          .add(Op0L);


  Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)

                          .addReg(Op1L_Op0H_Reg)

                          .addReg(Op1H_Op0L_Reg);


  MachineInstr *HiHalf =

      BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)

          .addReg(AddReg)

          .addReg(CarryReg);


  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)

      .addReg(DestSub0)

      .addImm(AMDGPU::sub0)

      .addReg(DestSub1)

      .addImm(AMDGPU::sub1);


  MRI.replaceRegWith(Dest.getReg(), FullDestReg);


  // Try to legalize the operands in case we need to swap the order to keep it

  // valid.

  legalizeOperands(*Op1L_Op0H, MDT);

  legalizeOperands(*Op1H_Op0L, MDT);

  legalizeOperands(*Carry, MDT);

  legalizeOperands(*LoHalf, MDT);

  legalizeOperands(*Add, MDT);

  legalizeOperands(*HiHalf, MDT);


  // Move all users of this moved value.

  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);

}


// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector

// multiplications.

void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,

                                        MachineInstr &Inst,

                                        MachineDominatorTree *MDT) const {

  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();


  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src0 = Inst.getOperand(1);

  MachineOperand &Src1 = Inst.getOperand(2);

  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;


  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());

  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());

  const TargetRegisterClass *Src0SubRC =

      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);

  if (RI.isSGPRClass(Src0SubRC))

    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);

  const TargetRegisterClass *Src1SubRC =

      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);

  if (RI.isSGPRClass(Src1SubRC))

    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);


  // First, we extract the low 32-bit and high 32-bit values from each of the

  // operands.

  MachineOperand Op0L =

      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);

  MachineOperand Op1L =

      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);


  unsigned Opc = Inst.getOpcode();

  unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO

                        ? AMDGPU::V_MUL_HI_U32_e64

                        : AMDGPU::V_MUL_HI_I32_e64;

  MachineInstr *HiHalf =

      BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);


  MachineInstr *LoHalf =

      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)

          .add(Op1L)

          .add(Op0L);


  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)

      .addReg(DestSub0)

      .addImm(AMDGPU::sub0)

      .addReg(DestSub1)

      .addImm(AMDGPU::sub1);


  MRI.replaceRegWith(Dest.getReg(), FullDestReg);


  // Try to legalize the operands in case we need to swap the order to keep it

  // valid.

  legalizeOperands(*HiHalf, MDT);

  legalizeOperands(*LoHalf, MDT);


  // Move all users of this moved value.

  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);

}


void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,

                                           MachineInstr &Inst, unsigned Opcode,

                                           MachineDominatorTree *MDT) const {

  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src0 = Inst.getOperand(1);

  MachineOperand &Src1 = Inst.getOperand(2);

  DebugLoc DL = Inst.getDebugLoc();


  MachineBasicBlock::iterator MII = Inst;


  const MCInstrDesc &InstDesc = get(Opcode);

  const TargetRegisterClass *Src0RC = Src0.isReg() ?

    MRI.getRegClass(Src0.getReg()) :

    &AMDGPU::SGPR_32RegClass;


  const TargetRegisterClass *Src0SubRC =

      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);

  const TargetRegisterClass *Src1RC = Src1.isReg() ?

    MRI.getRegClass(Src1.getReg()) :

    &AMDGPU::SGPR_32RegClass;


  const TargetRegisterClass *Src1SubRC =

      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);


  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,

                                                       AMDGPU::sub0, Src0SubRC);

  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,

                                                       AMDGPU::sub0, Src1SubRC);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,

                                                       AMDGPU::sub1, Src0SubRC);

  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,

                                                       AMDGPU::sub1, Src1SubRC);


  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());

  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);

  const TargetRegisterClass *NewDestSubRC =

      RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);


  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);

  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)

                              .add(SrcReg0Sub0)

                              .add(SrcReg1Sub0);


  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);

  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)

                              .add(SrcReg0Sub1)

                              .add(SrcReg1Sub1);


  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)

    .addReg(DestSub0)

    .addImm(AMDGPU::sub0)

    .addReg(DestSub1)

    .addImm(AMDGPU::sub1);


  MRI.replaceRegWith(Dest.getReg(), FullDestReg);


  Worklist.insert(&LoHalf);

  Worklist.insert(&HiHalf);


  // Move all users of this moved value.

  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);

}


void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,

                                       MachineInstr &Inst,

                                       MachineDominatorTree *MDT) const {

  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src0 = Inst.getOperand(1);

  MachineOperand &Src1 = Inst.getOperand(2);

  const DebugLoc &DL = Inst.getDebugLoc();


  MachineBasicBlock::iterator MII = Inst;


  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());


  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);


  MachineOperand* Op0;

  MachineOperand* Op1;


  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {

    Op0 = &Src0;

    Op1 = &Src1;

  } else {

    Op0 = &Src1;

    Op1 = &Src0;

  }


  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)

    .add(*Op0);


  Register NewDest = MRI.createVirtualRegister(DestRC);


  MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)

    .addReg(Interm)

    .add(*Op1);


  MRI.replaceRegWith(Dest.getReg(), NewDest);


  Worklist.insert(&Xor);

}


void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,

                                       MachineInstr &Inst) const {

  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();


  MachineBasicBlock::iterator MII = Inst;

  const DebugLoc &DL = Inst.getDebugLoc();


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src = Inst.getOperand(1);


  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);

  const TargetRegisterClass *SrcRC = Src.isReg() ?

    MRI.getRegClass(Src.getReg()) :

    &AMDGPU::SGPR_32RegClass;


  Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);


  const TargetRegisterClass *SrcSubRC =

      RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);


  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,

                                                      AMDGPU::sub0, SrcSubRC);

  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,

                                                      AMDGPU::sub1, SrcSubRC);


  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);


  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);


  MRI.replaceRegWith(Dest.getReg(), ResultReg);


  // We don't need to legalize operands here. src0 for either instruction can be

  // an SGPR, and the second input is unused or determined here.

  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);

}


void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,

                                      MachineInstr &Inst) const {

  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;

  const DebugLoc &DL = Inst.getDebugLoc();


  MachineOperand &Dest = Inst.getOperand(0);

  uint32_t Imm = Inst.getOperand(2).getImm();

  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].

  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].


  (void) Offset;


  // Only sext_inreg cases handled.

  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&

         Offset == 0 && "Not implemented");


  if (BitWidth < 32) {

    Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);


    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)

        .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)

        .addImm(0)

        .addImm(BitWidth);


    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)

      .addImm(31)

      .addReg(MidRegLo);


    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)

      .addReg(MidRegLo)

      .addImm(AMDGPU::sub0)

      .addReg(MidRegHi)

      .addImm(AMDGPU::sub1);


    MRI.replaceRegWith(Dest.getReg(), ResultReg);

    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);

    return;

  }


  MachineOperand &Src = Inst.getOperand(1);

  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);


  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)

    .addImm(31)

    .addReg(Src.getReg(), 0, AMDGPU::sub0);


  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)

    .addReg(Src.getReg(), 0, AMDGPU::sub0)

    .addImm(AMDGPU::sub0)

    .addReg(TmpReg)

    .addImm(AMDGPU::sub1);


  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);

}


void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,

                                          MachineInstr &Inst, unsigned Opcode,

                                          MachineDominatorTree *MDT) const {

  //  (S_FLBIT_I32_B64 hi:lo) ->

  // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))

  //  (S_FF1_I32_B64 hi:lo) ->

  // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))


  MachineBasicBlock &MBB = *Inst.getParent();

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;

  const DebugLoc &DL = Inst.getDebugLoc();


  MachineOperand &Dest = Inst.getOperand(0);

  MachineOperand &Src = Inst.getOperand(1);


  const MCInstrDesc &InstDesc = get(Opcode);


  bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;

  unsigned OpcodeAdd =

      ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;


  const TargetRegisterClass *SrcRC =

      Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *SrcSubRC =

      RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);


  MachineOperand SrcRegSub0 =

      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);

  MachineOperand SrcRegSub1 =

      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);


  Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);


  BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);


  BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);


  BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)

      .addReg(IsCtlz ? MidReg1 : MidReg2)

      .addImm(32)

      .addImm(1); // enable clamp


  BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)

      .addReg(MidReg3)

      .addReg(IsCtlz ? MidReg2 : MidReg1);


  MRI.replaceRegWith(Dest.getReg(), MidReg4);


  addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);

}


void SIInstrInfo::addUsersToMoveToVALUWorklist(

    Register DstReg, MachineRegisterInfo &MRI,

    SIInstrWorklist &Worklist) const {

  for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {

    MachineInstr &UseMI = *MO.getParent();


    unsigned OpNo = 0;


    switch (UseMI.getOpcode()) {

    case AMDGPU::COPY:

    case AMDGPU::WQM:

    case AMDGPU::SOFT_WQM:

    case AMDGPU::STRICT_WWM:

    case AMDGPU::STRICT_WQM:

    case AMDGPU::REG_SEQUENCE:

    case AMDGPU::PHI:

    case AMDGPU::INSERT_SUBREG:

      break;

    default:

      OpNo = MO.getOperandNo();

      break;

    }


    if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo)))

      Worklist.insert(&UseMI);

    else

      // Legalization could change user list.

      legalizeOperandsVALUt16(UseMI, OpNo, MRI);

  }

}


void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,

                                 MachineRegisterInfo &MRI,

                                 MachineInstr &Inst) const {

  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineBasicBlock *MBB = Inst.getParent();

  MachineOperand &Src0 = Inst.getOperand(1);

  MachineOperand &Src1 = Inst.getOperand(2);

  const DebugLoc &DL = Inst.getDebugLoc();


  switch (Inst.getOpcode()) {

  case AMDGPU::S_PACK_LL_B32_B16: {

    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);


    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are

    // 0.

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)

      .addImm(0xffff);


    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)

      .addReg(ImmReg, RegState::Kill)

      .add(Src0);


    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)

      .add(Src1)

      .addImm(16)

      .addReg(TmpReg, RegState::Kill);

    break;

  }

  case AMDGPU::S_PACK_LH_B32_B16: {

    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)

      .addImm(0xffff);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)

      .addReg(ImmReg, RegState::Kill)

      .add(Src0)

      .add(Src1);

    break;

  }

  case AMDGPU::S_PACK_HL_B32_B16: {

    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)

        .addImm(16)

        .add(Src0);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)

        .add(Src1)

        .addImm(16)

        .addReg(TmpReg, RegState::Kill);

    break;

  }

  case AMDGPU::S_PACK_HH_B32_B16: {

    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)

      .addImm(16)

      .add(Src0);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)

      .addImm(0xffff0000);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)

      .add(Src1)

      .addReg(ImmReg, RegState::Kill)

      .addReg(TmpReg, RegState::Kill);

    break;

  }

  default:

    llvm_unreachable("unhandled s_pack_* instruction");

  }


  MachineOperand &Dest = Inst.getOperand(0);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);

}


void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,

                                               MachineInstr &SCCDefInst,

                                               SIInstrWorklist &Worklist,

                                               Register NewCond) const {


  // Ensure that def inst defines SCC, which is still live.

  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&

         !Op.isDead() && Op.getParent() == &SCCDefInst);

  SmallVector<MachineInstr *, 4> CopyToDelete;

  // This assumes that all the users of SCC are in the same block

  // as the SCC def.

  for (MachineInstr &MI : // Skip the def inst itself.

       make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),

                  SCCDefInst.getParent()->end())) {

    // Check if SCC is used first.

    int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);

    if (SCCIdx != -1) {

      if (MI.isCopy()) {

        MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

        Register DestReg = MI.getOperand(0).getReg();


        MRI.replaceRegWith(DestReg, NewCond);

        CopyToDelete.push_back(&MI);

      } else {


        if (NewCond.isValid())

          MI.getOperand(SCCIdx).setReg(NewCond);


        Worklist.insert(&MI);

      }

    }

    // Exit if we find another SCC def.

    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)

      break;

  }

  for (auto &Copy : CopyToDelete)

    Copy->eraseFromParent();

}


// Instructions that use SCC may be converted to VALU instructions. When that

// happens, the SCC register is changed to VCC_LO. The instruction that defines

// SCC must be changed to an instruction that defines VCC. This function makes

// sure that the instruction that defines SCC is added to the moveToVALU

// worklist.

void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,

                                           SIInstrWorklist &Worklist) const {

  // Look for a preceding instruction that either defines VCC or SCC. If VCC

  // then there is nothing to do because the defining instruction has been

  // converted to a VALU already. If SCC then that instruction needs to be

  // converted to a VALU.

  for (MachineInstr &MI :

       make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),

                  SCCUseInst->getParent()->rend())) {

    if (MI.modifiesRegister(AMDGPU::VCC, &RI))

      break;

    if (MI.definesRegister(AMDGPU::SCC, &RI)) {

      Worklist.insert(&MI);

      break;

    }

  }

}


const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(

  const MachineInstr &Inst) const {

  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);


  switch (Inst.getOpcode()) {

  // For target instructions, getOpRegClass just returns the virtual register

  // class associated with the operand, so we need to find an equivalent VGPR

  // register class in order to move the instruction to the VALU.

  case AMDGPU::COPY:

  case AMDGPU::PHI:

  case AMDGPU::REG_SEQUENCE:

  case AMDGPU::INSERT_SUBREG:

  case AMDGPU::WQM:

  case AMDGPU::SOFT_WQM:

  case AMDGPU::STRICT_WWM:

  case AMDGPU::STRICT_WQM: {

    const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);

    if (RI.isAGPRClass(SrcRC)) {

      if (RI.isAGPRClass(NewDstRC))

        return nullptr;


      switch (Inst.getOpcode()) {

      case AMDGPU::PHI:

      case AMDGPU::REG_SEQUENCE:

      case AMDGPU::INSERT_SUBREG:

        NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);

        break;

      default:

        NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);

      }


      if (!NewDstRC)

        return nullptr;

    } else {

      if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)

        return nullptr;


      NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);

      if (!NewDstRC)

        return nullptr;

    }


    return NewDstRC;

  }

  default:

    return NewDstRC;

  }

}


// Find the one SGPR operand we are allowed to use.

Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,

                                   int OpIndices[3]) const {

  const MCInstrDesc &Desc = MI.getDesc();


  // Find the one SGPR operand we are allowed to use.

  //

  // First we need to consider the instruction's operand requirements before

  // legalizing. Some operands are required to be SGPRs, such as implicit uses

  // of VCC, but we are still bound by the constant bus requirement to only use

  // one.

  //

  // If the operand's class is an SGPR, we can never move it.


  Register SGPRReg = findImplicitSGPRRead(MI);

  if (SGPRReg)

    return SGPRReg;


  Register UsedSGPRs[3] = {Register()};

  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();


  for (unsigned i = 0; i < 3; ++i) {

    int Idx = OpIndices[i];

    if (Idx == -1)

      break;


    const MachineOperand &MO = MI.getOperand(Idx);

    if (!MO.isReg())

      continue;


    // Is this operand statically required to be an SGPR based on the operand

    // constraints?

    const TargetRegisterClass *OpRC =

        RI.getRegClass(Desc.operands()[Idx].RegClass);

    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);

    if (IsRequiredSGPR)

      return MO.getReg();


    // If this could be a VGPR or an SGPR, Check the dynamic register class.

    Register Reg = MO.getReg();

    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);

    if (RI.isSGPRClass(RegRC))

      UsedSGPRs[i] = Reg;

  }


  // We don't have a required SGPR operand, so we have a bit more freedom in

  // selecting operands to move.


  // Try to select the most used SGPR. If an SGPR is equal to one of the

  // others, we choose that.

  //

  // e.g.

  // V_FMA_F32 v0, s0, s0, s0 -> No moves

  // V_FMA_F32 v0, s0, s1, s0 -> Move s1


  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should

  // prefer those.


  if (UsedSGPRs[0]) {

    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])

      SGPRReg = UsedSGPRs[0];

  }


  if (!SGPRReg && UsedSGPRs[1]) {

    if (UsedSGPRs[1] == UsedSGPRs[2])

      SGPRReg = UsedSGPRs[1];

  }


  return SGPRReg;

}


MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,

                                             AMDGPU::OpName OperandName) const {

  if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)

    return nullptr;


  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);

  if (Idx == -1)

    return nullptr;


  return &MI.getOperand(Idx);

}


uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {

  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {

    int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11

                         ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT

                         : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;

    return (Format << 44) |

           (1ULL << 56) | // RESOURCE_LEVEL = 1

           (3ULL << 60); // OOB_SELECT = 3

  }


  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;

  if (ST.isAmdHsaOS()) {

    // Set ATC = 1. GFX9 doesn't have this bit.

    if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)

      RsrcDataFormat |= (1ULL << 56);


    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.

    // BTW, it disables TC L2 and therefore decreases performance.

    if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)

      RsrcDataFormat |= (2ULL << 59);

  }


  return RsrcDataFormat;

}


uint64_t SIInstrInfo::getScratchRsrcWords23() const {

  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |

                    AMDGPU::RSRC_TID_ENABLE |

                    0xffffffff; // Size;


  // GFX9 doesn't have ELEMENT_SIZE.

  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {

    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;

    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;

  }


  // IndexStride = 64 / 32.

  uint64_t IndexStride = ST.isWave64() ? 3 : 2;

  Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;


  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].

  // Clear them unless we want a huge stride.

  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&

      ST.getGeneration() <= AMDGPUSubtarget::GFX9)

    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;


  return Rsrc23;

}


bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {

  unsigned Opc = MI.getOpcode();


  return isSMRD(Opc);

}


bool SIInstrInfo::isHighLatencyDef(int Opc) const {

  return get(Opc).mayLoad() &&

         (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));

}


Register SIInstrInfo::isStackAccess(const MachineInstr &MI,

                                    int &FrameIndex) const {

  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);

  if (!Addr || !Addr->isFI())

    return Register();


  assert(!MI.memoperands_empty() &&

         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);


  FrameIndex = Addr->getIndex();

  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();

}


Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,

                                        int &FrameIndex) const {

  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);

  assert(Addr && Addr->isFI());

  FrameIndex = Addr->getIndex();

  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();

}


Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,

                                          int &FrameIndex) const {

  if (!MI.mayLoad())

    return Register();


  if (isMUBUF(MI) || isVGPRSpill(MI))

    return isStackAccess(MI, FrameIndex);


  if (isSGPRSpill(MI))

    return isSGPRStackAccess(MI, FrameIndex);


  return Register();

}


Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,

                                         int &FrameIndex) const {

  if (!MI.mayStore())

    return Register();


  if (isMUBUF(MI) || isVGPRSpill(MI))

    return isStackAccess(MI, FrameIndex);


  if (isSGPRSpill(MI))

    return isSGPRStackAccess(MI, FrameIndex);


  return Register();

}


unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {

  unsigned Size = 0;

  MachineBasicBlock::const_instr_iterator I = MI.getIterator();

  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();

  while (++I != E && I->isInsideBundle()) {

    assert(!I->isBundle() && "No nested bundle!");

    Size += getInstSizeInBytes(*I);

  }


  return Size;

}


unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {

  unsigned Opc = MI.getOpcode();

  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);

  unsigned DescSize = Desc.getSize();


  // If we have a definitive size, we can use it. Otherwise we need to inspect

  // the operands to know the size.

  if (isFixedSize(MI)) {

    unsigned Size = DescSize;


    // If we hit the buggy offset, an extra nop will be inserted in MC so

    // estimate the worst case.

    if (MI.isBranch() && ST.hasOffset3fBug())

      Size += 4;


    return Size;

  }


  // Instructions may have a 32-bit literal encoded after them. Check

  // operands that could ever be literals.

  if (isVALU(MI) || isSALU(MI)) {

    if (isDPP(MI))

      return DescSize;

    bool HasLiteral = false;

    unsigned LiteralSize = 4;

    for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {

      const MachineOperand &Op = MI.getOperand(I);

      const MCOperandInfo &OpInfo = Desc.operands()[I];

      if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {

        HasLiteral = true;

        if (ST.has64BitLiterals()) {

          switch (OpInfo.OperandType) {

          default:

            break;

          case AMDGPU::OPERAND_REG_IMM_FP64:

            if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))

              LiteralSize = 8;

            break;

          case AMDGPU::OPERAND_REG_IMM_INT64:

            if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))

              LiteralSize = 8;

            break;

          }

        }

        break;

      }

    }

    return HasLiteral ? DescSize + LiteralSize : DescSize;

  }


  // Check whether we have extra NSA words.

  if (isMIMG(MI)) {

    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);

    if (VAddr0Idx < 0)

      return 8;


    int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);

    return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);

  }


  switch (Opc) {

  case TargetOpcode::BUNDLE:

    return getInstBundleSize(MI);

  case TargetOpcode::INLINEASM:

  case TargetOpcode::INLINEASM_BR: {

    const MachineFunction *MF = MI.getParent()->getParent();

    const char *AsmStr = MI.getOperand(0).getSymbolName();

    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);

  }

  default:

    if (MI.isMetaInstruction())

      return 0;


    // If D16 Pseudo inst, get correct MC code size

    const auto *D16Info = AMDGPU::getT16D16Helper(Opc);

    if (D16Info) {

      // Assume d16_lo/hi inst are always in same size

      unsigned LoInstOpcode = D16Info->LoOp;

      const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);

      DescSize = Desc.getSize();

    }


    return DescSize;

  }

}


bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {

  if (!isFLAT(MI))

    return false;


  if (MI.memoperands_empty())

    return true;


  for (const MachineMemOperand *MMO : MI.memoperands()) {

    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)

      return true;

  }

  return false;

}


ArrayRef<std::pair<int, const char *>>


SIInstrInfo::getSerializableTargetIndices() const {

  static const std::pair<int, const char *> TargetIndices[] = {

      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},

      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},

      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},

      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},

      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};

  return ArrayRef(TargetIndices);

}


/// This is used by the post-RA scheduler (SchedulePostRAList.cpp).  The

/// post-RA version of misched uses CreateTargetMIHazardRecognizer.

ScheduleHazardRecognizer *


SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,

                                            const ScheduleDAG *DAG) const {

  return new GCNHazardRecognizer(DAG->MF);

}


/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer

/// pass.

ScheduleHazardRecognizer *


SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {

  return new GCNHazardRecognizer(MF);

}


// Called during:

// - pre-RA scheduling and post-RA scheduling

ScheduleHazardRecognizer *


SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,

                                            const ScheduleDAGMI *DAG) const {

  // Borrowed from Arm Target

  // We would like to restrict this hazard recognizer to only

  // post-RA scheduling; we can tell that we're post-RA because we don't

  // track VRegLiveness.

  if (!DAG->hasVRegLiveness())

    return new GCNHazardRecognizer(DAG->MF);

  return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);

}


std::pair<unsigned, unsigned>


SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {

  return std::pair(TF & MO_MASK, TF & ~MO_MASK);

}


ArrayRef<std::pair<unsigned, const char *>>


SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {

  static const std::pair<unsigned, const char *> TargetFlags[] = {

      {MO_GOTPCREL, "amdgpu-gotprel"},

      {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},

      {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},

      {MO_GOTPCREL64, "amdgpu-gotprel64"},

      {MO_REL32_LO, "amdgpu-rel32-lo"},

      {MO_REL32_HI, "amdgpu-rel32-hi"},

      {MO_REL64, "amdgpu-rel64"},

      {MO_ABS32_LO, "amdgpu-abs32-lo"},

      {MO_ABS32_HI, "amdgpu-abs32-hi"},

      {MO_ABS64, "amdgpu-abs64"},

  };


  return ArrayRef(TargetFlags);

}


ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>


SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {

  static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =

      {

          {MONoClobber, "amdgpu-noclobber"},

          {MOLastUse, "amdgpu-last-use"},

          {MOCooperative, "amdgpu-cooperative"},

      };


  return ArrayRef(TargetFlags);

}


unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,

                                              const MachineFunction &MF) const {

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(SrcReg.isVirtual());

  if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))

    return AMDGPU::WWM_COPY;


  return AMDGPU::COPY;

}


bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,

                                       Register Reg) const {

  // We need to handle instructions which may be inserted during register

  // allocation to handle the prolog. The initial prolog instruction may have

  // been separated from the start of the block by spills and copies inserted

  // needed by the prolog. However, the insertions for scalar registers can

  // always be placed at the BB top as they are independent of the exec mask

  // value.

  const MachineFunction *MF = MI.getParent()->getParent();

  bool IsNullOrVectorRegister = true;

  if (Reg) {

    const MachineRegisterInfo &MRI = MF->getRegInfo();

    IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));

  }


  uint16_t Opcode = MI.getOpcode();

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  return IsNullOrVectorRegister &&

         (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||

          (Opcode == AMDGPU::IMPLICIT_DEF &&

           MFI->isWWMReg(MI.getOperand(0).getReg())) ||

          (!MI.isTerminator() && Opcode != AMDGPU::COPY &&

           MI.modifiesRegister(AMDGPU::EXEC, &RI)));

}


MachineInstrBuilder


SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,

                           MachineBasicBlock::iterator I,

                           const DebugLoc &DL,

                           Register DestReg) const {

  if (ST.hasAddNoCarry())

    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);


  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());

  MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());


  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)

           .addReg(UnusedCarry, RegState::Define | RegState::Dead);

}


MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,

                                               MachineBasicBlock::iterator I,

                                               const DebugLoc &DL,

                                               Register DestReg,

                                               RegScavenger &RS) const {

  if (ST.hasAddNoCarry())

    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);


  // If available, prefer to use vcc.

  Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)

                             ? Register(RI.getVCC())

                             : RS.scavengeRegisterBackwards(

                                   *RI.getBoolRC(), I, /* RestoreAfter */ false,

                                   0, /* AllowSpill */ false);


  // TODO: Users need to deal with this.

  if (!UnusedCarry.isValid())

    return MachineInstrBuilder();


  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)

           .addReg(UnusedCarry, RegState::Define | RegState::Dead);

}


bool SIInstrInfo::isKillTerminator(unsigned Opcode) {

  switch (Opcode) {

  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:

  case AMDGPU::SI_KILL_I1_TERMINATOR:

    return true;

  default:

    return false;

  }

}


const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {

  switch (Opcode) {

  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:

    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);

  case AMDGPU::SI_KILL_I1_PSEUDO:

    return get(AMDGPU::SI_KILL_I1_TERMINATOR);

  default:

    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");

  }

}


bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {

  return Imm <= getMaxMUBUFImmOffset(ST);

}


unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {

  // GFX12 field is non-negative 24-bit signed byte offset.

  const unsigned OffsetBits =

      ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;

  return (1 << OffsetBits) - 1;

}


void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {

  if (!ST.isWave32())

    return;


  if (MI.isInlineAsm())

    return;


  for (auto &Op : MI.implicit_operands()) {

    if (Op.isReg() && Op.getReg() == AMDGPU::VCC)

      Op.setReg(AMDGPU::VCC_LO);

  }

}


bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {

  if (!isSMRD(MI))

    return false;


  // Check that it is using a buffer resource.

  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);

  if (Idx == -1) // e.g. s_memtime

    return false;


  const auto RCID = MI.getDesc().operands()[Idx].RegClass;

  return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);

}


// Given Imm, split it into the values to put into the SOffset and ImmOffset

// fields in an MUBUF instruction. Return false if it is not possible (due to a

// hardware bug needing a workaround).

//

// The required alignment ensures that individual address components remain

// aligned if they are aligned to begin with. It also ensures that additional

// offsets within the given alignment can be added to the resulting ImmOffset.


bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,

                                   uint32_t &ImmOffset, Align Alignment) const {

  const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);

  const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());

  uint32_t Overflow = 0;


  if (Imm > MaxImm) {

    if (Imm <= MaxImm + 64) {

      // Use an SOffset inline constant for 4..64

      Overflow = Imm - MaxImm;

      Imm = MaxImm;

    } else {

      // Try to keep the same value in SOffset for adjacent loads, so that

      // the corresponding register contents can be re-used.

      //

      // Load values with all low-bits (except for alignment bits) set into

      // SOffset, so that a larger range of values can be covered using

      // s_movk_i32.

      //

      // Atomic operations fail to work correctly when individual address

      // components are unaligned, even if their sum is aligned.

      uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;

      uint32_t Low = (Imm + Alignment.value()) & MaxOffset;

      Imm = Low;

      Overflow = High - Alignment.value();

    }

  }


  if (Overflow > 0) {

    // There is a hardware bug in SI and CI which prevents address clamping in

    // MUBUF instructions from working correctly with SOffsets. The immediate

    // offset is unaffected.

    if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)

      return false;


    // It is not possible to set immediate in SOffset field on some targets.

    if (ST.hasRestrictedSOffset())

      return false;

  }


  ImmOffset = Imm;

  SOffset = Overflow;

  return true;

}


// Depending on the used address space and instructions, some immediate offsets

// are allowed and some are not.

// Pre-GFX12, flat instruction offsets can only be non-negative, global and

// scratch instruction offsets can also be negative. On GFX12, offsets can be

// negative for all variants.

//

// There are several bugs related to these offsets:

// On gfx10.1, flat instructions that go into the global address space cannot

// use an offset.

//

// For scratch instructions, the address can be either an SGPR or a VGPR.

// The following offsets can be used, depending on the architecture (x means

// cannot be used):

// +----------------------------+------+------+

// | Address-Mode               | SGPR | VGPR |

// +----------------------------+------+------+

// | gfx9                       |      |      |

// | negative, 4-aligned offset | x    | ok   |

// | negative, unaligned offset | x    | ok   |

// +----------------------------+------+------+

// | gfx10                      |      |      |

// | negative, 4-aligned offset | ok   | ok   |

// | negative, unaligned offset | ok   | x    |

// +----------------------------+------+------+

// | gfx10.3                    |      |      |

// | negative, 4-aligned offset | ok   | ok   |

// | negative, unaligned offset | ok   | ok   |

// +----------------------------+------+------+

//

// This function ignores the addressing mode, so if an offset cannot be used in

// one addressing mode, it is considered illegal.


bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,

                                    uint64_t FlatVariant) const {

  // TODO: Should 0 be special cased?

  if (!ST.hasFlatInstOffsets())

    return false;


  if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&

      (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||

       AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))

    return false;


  if (ST.hasNegativeUnalignedScratchOffsetBug() &&

      FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&

      (Offset % 4) != 0) {

    return false;

  }


  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);

  unsigned N = AMDGPU::getNumFlatOffsetBits(ST);

  return isIntN(N, Offset) && (AllowNegative || Offset >= 0);

}


// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.

std::pair<int64_t, int64_t>


SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,

                             uint64_t FlatVariant) const {

  int64_t RemainderOffset = COffsetVal;

  int64_t ImmField = 0;


  bool AllowNegative = allowNegativeFlatOffset(FlatVariant);

  const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;


  if (AllowNegative) {

    // Use signed division by a power of two to truncate towards 0.

    int64_t D = 1LL << NumBits;

    RemainderOffset = (COffsetVal / D) * D;

    ImmField = COffsetVal - RemainderOffset;


    if (ST.hasNegativeUnalignedScratchOffsetBug() &&

        FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&

        (ImmField % 4) != 0) {

      // Make ImmField a multiple of 4

      RemainderOffset += ImmField % 4;

      ImmField -= ImmField % 4;

    }

  } else if (COffsetVal >= 0) {

    ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);

    RemainderOffset = COffsetVal - ImmField;

  }


  assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));

  assert(RemainderOffset + ImmField == COffsetVal);

  return {ImmField, RemainderOffset};

}


bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {

  if (ST.hasNegativeScratchOffsetBug() &&

      FlatVariant == SIInstrFlags::FlatScratch)

    return false;


  return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);

}


static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {

  switch (ST.getGeneration()) {

  default:

    break;

  case AMDGPUSubtarget::SOUTHERN_ISLANDS:

  case AMDGPUSubtarget::SEA_ISLANDS:

    return SIEncodingFamily::SI;

  case AMDGPUSubtarget::VOLCANIC_ISLANDS:

  case AMDGPUSubtarget::GFX9:

    return SIEncodingFamily::VI;

  case AMDGPUSubtarget::GFX10:

    return SIEncodingFamily::GFX10;

  case AMDGPUSubtarget::GFX11:

    return SIEncodingFamily::GFX11;

  case AMDGPUSubtarget::GFX12:

    return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250

                                : SIEncodingFamily::GFX12;

  }

  llvm_unreachable("Unknown subtarget generation!");

}


bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {

  switch(MCOp) {

  // These opcodes use indirect register addressing so

  // they need special handling by codegen (currently missing).

  // Therefore it is too risky to allow these opcodes

  // to be selected by dpp combiner or sdwa peepholer.

  case AMDGPU::V_MOVRELS_B32_dpp_gfx10:

  case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:

  case AMDGPU::V_MOVRELD_B32_dpp_gfx10:

  case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:

  case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:

  case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:

  case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:

  case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:

    return true;

  default:

    return false;

  }

}


#define GENERATE_RENAMED_GFX9_CASES(OPCODE)                                    \

  case OPCODE##_dpp:                                                           \

  case OPCODE##_e32:                                                           \

  case OPCODE##_e64:                                                           \

  case OPCODE##_e64_dpp:                                                       \

  case OPCODE##_sdwa:


static bool isRenamedInGFX9(int Opcode) {

  switch (Opcode) {

    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)

    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)

    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)

    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)

    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)

    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)

    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)

    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)

    GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)

  //

  case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:

  case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:

  case AMDGPU::V_FMA_F16_gfx9_e64:

  case AMDGPU::V_FMA_F16_gfx9_fake16_e64:

  case AMDGPU::V_INTERP_P2_F16:

  case AMDGPU::V_MAD_F16_e64:

  case AMDGPU::V_MAD_U16_e64:

  case AMDGPU::V_MAD_I16_e64:

    return true;

  default:

    return false;

  }

}


int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {

  Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);


  unsigned Gen = subtargetEncodingFamily(ST);


  if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))

    Gen = SIEncodingFamily::GFX9;


  // Adjust the encoding family to GFX80 for D16 buffer instructions when the

  // subtarget has UnpackedD16VMem feature.

  // TODO: remove this when we discard GFX80 encoding.

  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))

    Gen = SIEncodingFamily::GFX80;


  if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {

    switch (ST.getGeneration()) {

    default:

      Gen = SIEncodingFamily::SDWA;

      break;

    case AMDGPUSubtarget::GFX9:

      Gen = SIEncodingFamily::SDWA9;

      break;

    case AMDGPUSubtarget::GFX10:

      Gen = SIEncodingFamily::SDWA10;

      break;

    }

  }


  if (isMAI(Opcode)) {

    int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);

    if (MFMAOp != -1)

      Opcode = MFMAOp;

  }


  int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);


  if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())

    MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);


  // -1 means that Opcode is already a native instruction.

  if (MCOp == -1)

    return Opcode;


  if (ST.hasGFX90AInsts()) {

    uint16_t NMCOp = (uint16_t)-1;

    if (ST.hasGFX940Insts())

      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);

    if (NMCOp == (uint16_t)-1)

      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);

    if (NMCOp == (uint16_t)-1)

      NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);

    if (NMCOp != (uint16_t)-1)

      MCOp = NMCOp;

  }


  // (uint16_t)-1 means that Opcode is a pseudo instruction that has

  // no encoding in the given subtarget generation.

  if (MCOp == (uint16_t)-1)

    return -1;


  if (isAsmOnlyOpcode(MCOp))

    return -1;


  return MCOp;

}


static


TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {

  assert(RegOpnd.isReg());

  return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :

                             getRegSubRegPair(RegOpnd);

}


TargetInstrInfo::RegSubRegPair


llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {

  assert(MI.isRegSequence());

  for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)

    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {

      auto &RegOp = MI.getOperand(1 + 2 * I);

      return getRegOrUndef(RegOp);

    }

  return TargetInstrInfo::RegSubRegPair();

}


// Try to find the definition of reg:subreg in subreg-manipulation pseudos

// Following a subreg of reg:subreg isn't supported


static bool followSubRegDef(MachineInstr &MI,

                            TargetInstrInfo::RegSubRegPair &RSR) {

  if (!RSR.SubReg)

    return false;

  switch (MI.getOpcode()) {

  default: break;

  case AMDGPU::REG_SEQUENCE:

    RSR = getRegSequenceSubReg(MI, RSR.SubReg);

    return true;

  // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg

  case AMDGPU::INSERT_SUBREG:

    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())

      // inserted the subreg we're looking for

      RSR = getRegOrUndef(MI.getOperand(2));

    else { // the subreg in the rest of the reg

      auto R1 = getRegOrUndef(MI.getOperand(1));

      if (R1.SubReg) // subreg of subreg isn't supported

        return false;

      RSR.Reg = R1.Reg;

    }

    return true;

  }

  return false;

}


MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,

                                     MachineRegisterInfo &MRI) {

  assert(MRI.isSSA());

  if (!P.Reg.isVirtual())

    return nullptr;


  auto RSR = P;

  auto *DefInst = MRI.getVRegDef(RSR.Reg);

  while (auto *MI = DefInst) {

    DefInst = nullptr;

    switch (MI->getOpcode()) {

    case AMDGPU::COPY:

    case AMDGPU::V_MOV_B32_e32: {

      auto &Op1 = MI->getOperand(1);

      if (Op1.isReg() && Op1.getReg().isVirtual()) {

        if (Op1.isUndef())

          return nullptr;

        RSR = getRegSubRegPair(Op1);

        DefInst = MRI.getVRegDef(RSR.Reg);

      }

      break;

    }

    default:

      if (followSubRegDef(*MI, RSR)) {

        if (!RSR.Reg)

          return nullptr;

        DefInst = MRI.getVRegDef(RSR.Reg);

      }

    }

    if (!DefInst)

      return MI;

  }

  return nullptr;

}


bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,

                                      Register VReg,

                                      const MachineInstr &DefMI,

                                      const MachineInstr &UseMI) {

  assert(MRI.isSSA() && "Must be run on SSA");


  auto *TRI = MRI.getTargetRegisterInfo();

  auto *DefBB = DefMI.getParent();


  // Don't bother searching between blocks, although it is possible this block

  // doesn't modify exec.

  if (UseMI.getParent() != DefBB)

    return true;


  const int MaxInstScan = 20;

  int NumInst = 0;


  // Stop scan at the use.

  auto E = UseMI.getIterator();

  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {

    if (I->isDebugInstr())

      continue;


    if (++NumInst > MaxInstScan)

      return true;


    if (I->modifiesRegister(AMDGPU::EXEC, TRI))

      return true;

  }


  return false;

}


bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,

                                         Register VReg,

                                         const MachineInstr &DefMI) {

  assert(MRI.isSSA() && "Must be run on SSA");


  auto *TRI = MRI.getTargetRegisterInfo();

  auto *DefBB = DefMI.getParent();


  const int MaxUseScan = 10;

  int NumUse = 0;


  for (auto &Use : MRI.use_nodbg_operands(VReg)) {

    auto &UseInst = *Use.getParent();

    // Don't bother searching between blocks, although it is possible this block

    // doesn't modify exec.

    if (UseInst.getParent() != DefBB || UseInst.isPHI())

      return true;


    if (++NumUse > MaxUseScan)

      return true;

  }


  if (NumUse == 0)

    return false;


  const int MaxInstScan = 20;

  int NumInst = 0;


  // Stop scan when we have seen all the uses.

  for (auto I = std::next(DefMI.getIterator()); ; ++I) {

    assert(I != DefBB->end());


    if (I->isDebugInstr())

      continue;


    if (++NumInst > MaxInstScan)

      return true;


    for (const MachineOperand &Op : I->operands()) {

      // We don't check reg masks here as they're used only on calls:

      // 1. EXEC is only considered const within one BB

      // 2. Call should be a terminator instruction if present in a BB


      if (!Op.isReg())

        continue;


      Register Reg = Op.getReg();

      if (Op.isUse()) {

        if (Reg == VReg && --NumUse == 0)

          return false;

      } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))

        return true;

    }

  }

}


MachineInstr *SIInstrInfo::createPHIDestinationCopy(

    MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,

    const DebugLoc &DL, Register Src, Register Dst) const {

  auto Cur = MBB.begin();

  if (Cur != MBB.end())

    do {

      if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))

        return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);

      ++Cur;

    } while (Cur != MBB.end() && Cur != LastPHIIt);


  return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,

                                                   Dst);

}


MachineInstr *SIInstrInfo::createPHISourceCopy(

    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,

    const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {

  if (InsPt != MBB.end() &&

      (InsPt->getOpcode() == AMDGPU::SI_IF ||

       InsPt->getOpcode() == AMDGPU::SI_ELSE ||

       InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&

      InsPt->definesRegister(Src, /*TRI=*/nullptr)) {

    InsPt++;

    return BuildMI(MBB, InsPt, DL,

                   get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)

        .addReg(Src, 0, SrcSubReg)

        .addReg(AMDGPU::EXEC, RegState::Implicit);

  }

  return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,

                                              Dst);

}


bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }


MachineInstr *SIInstrInfo::foldMemoryOperandImpl(

    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,

    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,

    VirtRegMap *VRM) const {

  // This is a bit of a hack (copied from AArch64). Consider this instruction:

  //

  //   %0:sreg_32 = COPY $m0

  //

  // We explicitly chose SReg_32 for the virtual register so such a copy might

  // be eliminated by RegisterCoalescer. However, that may not be possible, and

  // %0 may even spill. We can't spill $m0 normally (it would require copying to

  // a numbered SGPR anyway), and since it is in the SReg_32 register class,

  // TargetInstrInfo::foldMemoryOperand() is going to try.

  // A similar issue also exists with spilling and reloading $exec registers.

  //

  // To prevent that, constrain the %0 register class here.

  if (isFullCopyInstr(MI)) {

    Register DstReg = MI.getOperand(0).getReg();

    Register SrcReg = MI.getOperand(1).getReg();

    if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&

        (DstReg.isVirtual() != SrcReg.isVirtual())) {

      MachineRegisterInfo &MRI = MF.getRegInfo();

      Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;

      const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);

      if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {

        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);

        return nullptr;

      }

      if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {

        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);

        return nullptr;

      }

    }

  }


  return nullptr;

}


unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,

                                      const MachineInstr &MI,

                                      unsigned *PredCost) const {

  if (MI.isBundle()) {

    MachineBasicBlock::const_instr_iterator I(MI.getIterator());

    MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());

    unsigned Lat = 0, Count = 0;

    for (++I; I != E && I->isBundledWithPred(); ++I) {

      ++Count;

      Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));

    }

    return Lat + Count - 1;

  }


  return SchedModel.computeInstrLatency(&MI);

}


InstructionUniformity


SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {

  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

  unsigned Opcode = MI.getOpcode();


  auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {

    Register Dst = MI.getOperand(0).getReg();

    Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()

                                       : MI.getOperand(1).getReg();

    LLT DstTy = MRI.getType(Dst);

    LLT SrcTy = MRI.getType(Src);

    unsigned DstAS = DstTy.getAddressSpace();

    unsigned SrcAS = SrcTy.getAddressSpace();

    return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&

                   DstAS == AMDGPUAS::FLAT_ADDRESS &&

                   ST.hasGloballyAddressableScratch()

               ? InstructionUniformity::NeverUniform

               : InstructionUniformity::Default;

  };


  // If the target supports globally addressable scratch, the mapping from

  // scratch memory to the flat aperture changes therefore an address space cast

  // is no longer uniform.

  if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)

    return HandleAddrSpaceCast(MI);


  if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {

    auto IID = GI->getIntrinsicID();

    if (AMDGPU::isIntrinsicSourceOfDivergence(IID))

      return InstructionUniformity::NeverUniform;

    if (AMDGPU::isIntrinsicAlwaysUniform(IID))

      return InstructionUniformity::AlwaysUniform;


    switch (IID) {

    case Intrinsic::amdgcn_addrspacecast_nonnull:

      return HandleAddrSpaceCast(MI);

    case Intrinsic::amdgcn_if:

    case Intrinsic::amdgcn_else:

      // FIXME: Uniform if second result

      break;

    }


    return InstructionUniformity::Default;

  }


  // Loads from the private and flat address spaces are divergent, because

  // threads can execute the load instruction with the same inputs and get

  // different results.

  //

  // All other loads are not divergent, because if threads issue loads with the

  // same arguments, they will always get the same result.

  if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||

      Opcode == AMDGPU::G_SEXTLOAD) {

    if (MI.memoperands_empty())

      return InstructionUniformity::NeverUniform; // conservative assumption


    if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {

          return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||

                 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;

        })) {

      // At least one MMO in a non-global address space.

      return InstructionUniformity::NeverUniform;

    }

    return InstructionUniformity::Default;

  }


  if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||

      Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||

      Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||

      AMDGPU::isGenericAtomic(Opcode)) {

    return InstructionUniformity::NeverUniform;

  }

  return InstructionUniformity::Default;

}


InstructionUniformity


SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {


  if (isNeverUniform(MI))

    return InstructionUniformity::NeverUniform;


  unsigned opcode = MI.getOpcode();

  if (opcode == AMDGPU::V_READLANE_B32 ||

      opcode == AMDGPU::V_READFIRSTLANE_B32 ||

      opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)

    return InstructionUniformity::AlwaysUniform;


  if (isCopyInstr(MI)) {

    const MachineOperand &srcOp = MI.getOperand(1);

    if (srcOp.isReg() && srcOp.getReg().isPhysical()) {

      const TargetRegisterClass *regClass =

          RI.getPhysRegBaseClass(srcOp.getReg());

      return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform

                                      : InstructionUniformity::NeverUniform;

    }

    return InstructionUniformity::Default;

  }


  // GMIR handling

  if (MI.isPreISelOpcode())

    return SIInstrInfo::getGenericInstructionUniformity(MI);


  // Atomics are divergent because they are executed sequentially: when an

  // atomic operation refers to the same address in each thread, then each

  // thread after the first sees the value written by the previous thread as

  // original value.


  if (isAtomic(MI))

    return InstructionUniformity::NeverUniform;


  // Loads from the private and flat address spaces are divergent, because

  // threads can execute the load instruction with the same inputs and get

  // different results.

  if (isFLAT(MI) && MI.mayLoad()) {

    if (MI.memoperands_empty())

      return InstructionUniformity::NeverUniform; // conservative assumption


    if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {

          return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||

                 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;

        })) {

      // At least one MMO in a non-global address space.

      return InstructionUniformity::NeverUniform;

    }


    return InstructionUniformity::Default;

  }


  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();


  // FIXME: It's conceptually broken to report this for an instruction, and not

  // a specific def operand. For inline asm in particular, there could be mixed

  // uniform and divergent results.

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {

    const MachineOperand &SrcOp = MI.getOperand(I);

    if (!SrcOp.isReg())

      continue;


    Register Reg = SrcOp.getReg();

    if (!Reg || !SrcOp.readsReg())

      continue;


    // If RegBank is null, this is unassigned or an unallocatable special

    // register, which are all scalars.

    const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);

    if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)

      return InstructionUniformity::NeverUniform;

  }


  // TODO: Uniformity check condtions above can be rearranged for more

  // redability


  // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are

  //       currently turned into no-op COPYs by SelectionDAG ISel and are

  //       therefore no longer recognizable.


  return InstructionUniformity::Default;

}


unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {

  switch (MF.getFunction().getCallingConv()) {

  case CallingConv::AMDGPU_PS:

    return 1;

  case CallingConv::AMDGPU_VS:

    return 2;

  case CallingConv::AMDGPU_GS:

    return 3;

  case CallingConv::AMDGPU_HS:

  case CallingConv::AMDGPU_LS:

  case CallingConv::AMDGPU_ES: {

    const Function &F = MF.getFunction();

    F.getContext().diagnose(DiagnosticInfoUnsupported(

        F, "ds_ordered_count unsupported for this calling conv"));

    [[fallthrough]];

  }

  case CallingConv::AMDGPU_CS:

  case CallingConv::AMDGPU_KERNEL:

  case CallingConv::C:

  case CallingConv::Fast:

  default:

    // Assume other calling conventions are various compute callable functions

    return 0;

  }

}


bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,

                                 Register &SrcReg2, int64_t &CmpMask,

                                 int64_t &CmpValue) const {

  if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())

    return false;


  switch (MI.getOpcode()) {

  default:

    break;

  case AMDGPU::S_CMP_EQ_U32:

  case AMDGPU::S_CMP_EQ_I32:

  case AMDGPU::S_CMP_LG_U32:

  case AMDGPU::S_CMP_LG_I32:

  case AMDGPU::S_CMP_LT_U32:

  case AMDGPU::S_CMP_LT_I32:

  case AMDGPU::S_CMP_GT_U32:

  case AMDGPU::S_CMP_GT_I32:

  case AMDGPU::S_CMP_LE_U32:

  case AMDGPU::S_CMP_LE_I32:

  case AMDGPU::S_CMP_GE_U32:

  case AMDGPU::S_CMP_GE_I32:

  case AMDGPU::S_CMP_EQ_U64:

  case AMDGPU::S_CMP_LG_U64:

    SrcReg = MI.getOperand(0).getReg();

    if (MI.getOperand(1).isReg()) {

      if (MI.getOperand(1).getSubReg())

        return false;

      SrcReg2 = MI.getOperand(1).getReg();

      CmpValue = 0;

    } else if (MI.getOperand(1).isImm()) {

      SrcReg2 = Register();

      CmpValue = MI.getOperand(1).getImm();

    } else {

      return false;

    }

    CmpMask = ~0;

    return true;

  case AMDGPU::S_CMPK_EQ_U32:

  case AMDGPU::S_CMPK_EQ_I32:

  case AMDGPU::S_CMPK_LG_U32:

  case AMDGPU::S_CMPK_LG_I32:

  case AMDGPU::S_CMPK_LT_U32:

  case AMDGPU::S_CMPK_LT_I32:

  case AMDGPU::S_CMPK_GT_U32:

  case AMDGPU::S_CMPK_GT_I32:

  case AMDGPU::S_CMPK_LE_U32:

  case AMDGPU::S_CMPK_LE_I32:

  case AMDGPU::S_CMPK_GE_U32:

  case AMDGPU::S_CMPK_GE_I32:

    SrcReg = MI.getOperand(0).getReg();

    SrcReg2 = Register();

    CmpValue = MI.getOperand(1).getImm();

    CmpMask = ~0;

    return true;

  }


  return false;

}


bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,

                                       Register SrcReg2, int64_t CmpMask,

                                       int64_t CmpValue,

                                       const MachineRegisterInfo *MRI) const {

  if (!SrcReg || SrcReg.isPhysical())

    return false;


  if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))

    return false;


  const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,

                               this](int64_t ExpectedValue, unsigned SrcSize,

                                     bool IsReversible, bool IsSigned) -> bool {

    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n

    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n

    // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n

    // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n

    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n

    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n

    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n

    // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n

    // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n

    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n

    //

    // Signed ge/gt are not used for the sign bit.

    //

    // If result of the AND is unused except in the compare:

    // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n

    //

    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n

    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n

    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n

    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n

    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n

    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n


    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);

    if (!Def || Def->getParent() != CmpInstr.getParent())

      return false;


    if (Def->getOpcode() != AMDGPU::S_AND_B32 &&

        Def->getOpcode() != AMDGPU::S_AND_B64)

      return false;


    int64_t Mask;

    const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {

      if (MO->isImm())

        Mask = MO->getImm();

      else if (!getFoldableImm(MO, Mask))

        return false;

      Mask &= maxUIntN(SrcSize);

      return isPowerOf2_64(Mask);

    };


    MachineOperand *SrcOp = &Def->getOperand(1);

    if (isMask(SrcOp))

      SrcOp = &Def->getOperand(2);

    else if (isMask(&Def->getOperand(2)))

      SrcOp = &Def->getOperand(1);

    else

      return false;


    // A valid Mask is required to have a single bit set, hence a non-zero and

    // power-of-two value. This verifies that we will not do 64-bit shift below.

    assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");

    unsigned BitNo = llvm::countr_zero((uint64_t)Mask);

    if (IsSigned && BitNo == SrcSize - 1)

      return false;


    ExpectedValue <<= BitNo;


    bool IsReversedCC = false;

    if (CmpValue != ExpectedValue) {

      if (!IsReversible)

        return false;

      IsReversedCC = CmpValue == (ExpectedValue ^ Mask);

      if (!IsReversedCC)

        return false;

    }


    Register DefReg = Def->getOperand(0).getReg();

    if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))

      return false;


    for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();

         I != E; ++I) {

      if (I->modifiesRegister(AMDGPU::SCC, &RI) ||

          I->killsRegister(AMDGPU::SCC, &RI))

        return false;

    }


    MachineOperand *SccDef =

        Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);

    SccDef->setIsDead(false);

    CmpInstr.eraseFromParent();


    if (!MRI->use_nodbg_empty(DefReg)) {

      assert(!IsReversedCC);

      return true;

    }


    // Replace AND with unused result with a S_BITCMP.

    MachineBasicBlock *MBB = Def->getParent();


    unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32

                                                     : AMDGPU::S_BITCMP1_B32

                                      : IsReversedCC ? AMDGPU::S_BITCMP0_B64

                                                     : AMDGPU::S_BITCMP1_B64;


    BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))

      .add(*SrcOp)

      .addImm(BitNo);

    Def->eraseFromParent();


    return true;

  };


  switch (CmpInstr.getOpcode()) {

  default:

    break;

  case AMDGPU::S_CMP_EQ_U32:

  case AMDGPU::S_CMP_EQ_I32:

  case AMDGPU::S_CMPK_EQ_U32:

  case AMDGPU::S_CMPK_EQ_I32:

    return optimizeCmpAnd(1, 32, true, false);

  case AMDGPU::S_CMP_GE_U32:

  case AMDGPU::S_CMPK_GE_U32:

    return optimizeCmpAnd(1, 32, false, false);

  case AMDGPU::S_CMP_GE_I32:

  case AMDGPU::S_CMPK_GE_I32:

    return optimizeCmpAnd(1, 32, false, true);

  case AMDGPU::S_CMP_EQ_U64:

    return optimizeCmpAnd(1, 64, true, false);

  case AMDGPU::S_CMP_LG_U32:

  case AMDGPU::S_CMP_LG_I32:

  case AMDGPU::S_CMPK_LG_U32:

  case AMDGPU::S_CMPK_LG_I32:

    return optimizeCmpAnd(0, 32, true, false);

  case AMDGPU::S_CMP_GT_U32:

  case AMDGPU::S_CMPK_GT_U32:

    return optimizeCmpAnd(0, 32, false, false);

  case AMDGPU::S_CMP_GT_I32:

  case AMDGPU::S_CMPK_GT_I32:

    return optimizeCmpAnd(0, 32, false, true);

  case AMDGPU::S_CMP_LG_U64:

    return optimizeCmpAnd(0, 64, true, false);

  }


  return false;

}


void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,

                                            AMDGPU::OpName OpName) const {

  if (!ST.needsAlignedVGPRs())

    return;


  int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);

  if (OpNo < 0)

    return;

  MachineOperand &Op = MI.getOperand(OpNo);

  if (getOpSize(MI, OpNo) > 4)

    return;


  // Add implicit aligned super-reg to force alignment on the data operand.

  const DebugLoc &DL = MI.getDebugLoc();

  MachineBasicBlock *BB = MI.getParent();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

  Register DataReg = Op.getReg();

  bool IsAGPR = RI.isAGPR(MRI, DataReg);

  Register Undef = MRI.createVirtualRegister(

      IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);

  BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);

  Register NewVR =

      MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass

                                       : &AMDGPU::VReg_64_Align2RegClass);

  BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)

      .addReg(DataReg, 0, Op.getSubReg())

      .addImm(AMDGPU::sub0)

      .addReg(Undef)

      .addImm(AMDGPU::sub1);

  Op.setReg(NewVR);

  Op.setSubReg(AMDGPU::sub0);

  MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));

}


bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {

  if (isIGLP(*MI))

    return false;


  return TargetInstrInfo::isGlobalMemoryObject(MI);

}


bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {

  if (!isWMMA(MI) && !isSWMMAC(MI))

    return false;


  if (AMDGPU::isGFX1250(ST))

    return AMDGPU::getWMMAIsXDL(MI.getOpcode());


  return true;

}


bool SIInstrInfo::isXDL(const MachineInstr &MI) const {

  unsigned Opcode = MI.getOpcode();


  if (AMDGPU::isGFX12Plus(ST))

    return isDOT(MI) || isXDLWMMA(MI);


  if (!isMAI(MI) || isDGEMM(Opcode) ||

      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||

      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)

    return false;


  if (!ST.hasGFX940Insts())

    return true;


  return AMDGPU::getMAIIsGFX940XDL(Opcode);

}


SubReg
unsigned SubReg
Definition AArch64AdvSIMDScalarPass.cpp:102

MRI
unsigned const MachineRegisterInfo * MRI
Definition AArch64AdvSIMDScalarPass.cpp:103

UseMI
MachineInstrBuilder & UseMI
Definition AArch64ExpandPseudoInsts.cpp:111

DefMI
MachineInstrBuilder MachineInstrBuilder & DefMI
Definition AArch64ExpandPseudoInsts.cpp:112

RegSize
unsigned RegSize
Definition AArch64MIPeepholeOpt.cpp:165

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

AMDGPUBaseInfo.h

AMDGPUInstrInfo.h
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.

AMDGPULaneMaskUtils.h

Select
AMDGPU Register Bank Select
Definition AMDGPURegBankSelect.cpp:68

AMDGPU.h

MBB
MachineBasicBlock & MBB
Definition ARMSLSHardening.cpp:71

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

MBBI
MachineBasicBlock MachineBasicBlock::iterator MBBI
Definition ARMSLSHardening.cpp:72

D
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

CommandLine.h

DiagnosticInfo.h

GCNHazardRecognizer.h

GCNSubtarget.h
AMD GCN specific subclass of TargetSubtarget.

GenericMachineInstrs.h
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...

TII
const HexagonInstrInfo * TII
Definition HexagonCopyToCombine.cpp:118

MI
IRTranslator LLVM IR MI
Definition IRTranslator.cpp:110

InlinePriorityMode::Size
@ Size
Definition InlineOrder.cpp:25

OffsetOp
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
Definition InstCombineCompares.cpp:5837

NumOps
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Definition ItaniumDemangle.h:3450

Ops
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Definition ItaniumDemangle.h:3368

LiveIntervals.h

LiveVariables.h

MCContext.h

F
#define F(x, y, z)
Definition MD5.cpp:55

I
#define I(x, y, z)
Definition MD5.cpp:58

MachineDominators.h

MachineFrameInfo.h

isUndef
static bool isUndef(const MachineInstr &MI)
Definition MachineSSAContext.cpp:57

MachineScheduler.h

RegSubRegPair
TargetInstrInfo::RegSubRegPair RegSubRegPair
Definition MachineSink.cpp:122

Reg
Register Reg
Definition MachineSink.cpp:2117

TRI
Register const TargetRegisterInfo * TRI
Definition MachineSink.cpp:2118

Register
Promote Memory to Register
Definition Mem2Reg.cpp:110

getReg
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
Definition MipsDisassembler.cpp:106

OpIdx
MachineInstr unsigned OpIdx
Definition NVPTXPrologEpilogPass.cpp:56

High
uint64_t High
Definition NVVMIntrRange.cpp:46

II
uint64_t IntrinsicInst * II
Definition NVVMIntrRange.cpp:46

P
#define P(N)

Merge
R600 Clause Merge
Definition R600ClauseMergePass.cpp:70

TBB
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
Definition RISCVRedundantCopyElimination.cpp:72

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition RISCVRedundantCopyElimination.cpp:71

Opc
auto Opc
Definition RISCVRedundantCopyElimination.cpp:75

RegisterScavenging.h
This file declares the machine register scavenger class.

Fix16BitCopies
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)

expandSGPRCopy
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
Definition SIInstrInfo.cpp:762

getNewFMAInst
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
Definition SIInstrInfo.cpp:3989

indirectCopyToAGPR
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
Definition SIInstrInfo.cpp:642

getIndirectSGPRWriteMovRelPseudo32
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
Definition SIInstrInfo.cpp:1483

compareMachineOp
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
Definition SIInstrInfo.cpp:4560

isStride64
static bool isStride64(unsigned Opc)
Definition SIInstrInfo.cpp:352

GENERATE_RENAMED_GFX9_CASES
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
Definition SIInstrInfo.cpp:9885

extractRsrcPtr
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
Definition SIInstrInfo.cpp:7033

followSubRegDef
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
Definition SIInstrInfo.cpp:10004

getIndirectSGPRWriteMovRelPseudo64
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
Definition SIInstrInfo.cpp:1512

swapImmOperands
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
Definition SIInstrInfo.cpp:2739

copyFlagsToImplicitVCC
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
Definition SIInstrInfo.cpp:4713

emitLoadScalarOpsFromVGPRLoop
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
Definition SIInstrInfo.cpp:6786

offsetsDoNotOverlap
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
Definition SIInstrInfo.cpp:3856

getWWMRegSpillSaveOpcode
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
Definition SIInstrInfo.cpp:1652

memOpsHaveSameBaseOperands
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
Definition SIInstrInfo.cpp:3845

getWWMRegSpillRestoreOpcode
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
Definition SIInstrInfo.cpp:1844

getFoldableImm
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
Definition SIInstrInfo.cpp:3954

getIndirectVGPRWriteMovRelPseudoOpc
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
Definition SIInstrInfo.cpp:1454

subtargetEncodingFamily
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
Definition SIInstrInfo.cpp:9844

preserveCondRegFlags
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
Definition SIInstrInfo.cpp:3191

findImplicitSGPRRead
static Register findImplicitSGPRRead(const MachineInstr &MI)
Definition SIInstrInfo.cpp:4808

getNewFMAAKInst
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
Definition SIInstrInfo.cpp:3475

BranchOffsetBits
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))

updateLiveVariables
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
Definition SIInstrInfo.cpp:3977

memOpsHaveSameBasePtr
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
Definition SIInstrInfo.cpp:530

getSGPRSpillRestoreOpcode
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
Definition SIInstrInfo.cpp:1737

isRegOrFI
static bool isRegOrFI(const MachineOperand &MO)
Definition SIInstrInfo.cpp:4852

getSGPRSpillSaveOpcode
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
Definition SIInstrInfo.cpp:1545

ModifierOpNames
static constexpr AMDGPU::OpName ModifierOpNames[]
Definition SIInstrInfo.cpp:3437

getVGPRSpillSaveOpcode
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
Definition SIInstrInfo.cpp:1580

reportIllegalCopy
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
Definition SIInstrInfo.cpp:625

swapRegAndNonRegOperand
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
Definition SIInstrInfo.cpp:2710

adjustAllocatableRegClass
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MCInstrDesc &TID, unsigned RCID)
Definition SIInstrInfo.cpp:5957

shouldReadExec
static bool shouldReadExec(const MachineInstr &MI)
Definition SIInstrInfo.cpp:4830

getNewFMAMKInst
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
Definition SIInstrInfo.cpp:3507

isRenamedInGFX9
static bool isRenamedInGFX9(int Opcode)
Definition SIInstrInfo.cpp:9892

getRegOrUndef
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
Definition SIInstrInfo.cpp:9985

changesVGPRIndexingMode
static bool changesVGPRIndexingMode(const MachineInstr &MI)
Definition SIInstrInfo.cpp:4268

isSubRegOf
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
Definition SIInstrInfo.cpp:4856

nodesHaveSameOperandValue
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition SIInstrInfo.cpp:84

getAVSpillSaveOpcode
static unsigned getAVSpillSaveOpcode(unsigned Size)
Definition SIInstrInfo.cpp:1617

getNumOperandsNoGlue
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition SIInstrInfo.cpp:75

canRemat
static bool canRemat(const MachineInstr &MI)
Definition SIInstrInfo.cpp:110

loadMBUFScalarOperandsFromVGPR
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
Definition SIInstrInfo.cpp:6926

getAVSpillRestoreOpcode
static unsigned getAVSpillRestoreOpcode(unsigned Size)
Definition SIInstrInfo.cpp:1809

getVGPRSpillRestoreOpcode
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Definition SIInstrInfo.cpp:1772

SIInstrInfo.h
Interface definition for SIInstrInfo.

IsDead
bool IsDead
Definition SILowerControlFlow.cpp:174

SIMachineFunctionInfo.h

STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.

contains
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480

ScheduleDAG.h

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition Debug.h:114

ValueTracking.h

AMDGPUGenInstrInfo

Node
Definition ItaniumDemangle.h:166

llvm::AMDGPURegisterBankInfo
Definition AMDGPURegisterBankInfo.h:42

llvm::AMDGPUSubtarget::GFX10
@ GFX10
Definition AMDGPUSubtarget.h:42

llvm::AMDGPUSubtarget::GFX9
@ GFX9
Definition AMDGPUSubtarget.h:41

llvm::AMDGPUSubtarget::GFX12
@ GFX12
Definition AMDGPUSubtarget.h:44

llvm::AMDGPUSubtarget::SEA_ISLANDS
@ SEA_ISLANDS
Definition AMDGPUSubtarget.h:39

llvm::AMDGPUSubtarget::SOUTHERN_ISLANDS
@ SOUTHERN_ISLANDS
Definition AMDGPUSubtarget.h:38

llvm::AMDGPUSubtarget::VOLCANIC_ISLANDS
@ VOLCANIC_ISLANDS
Definition AMDGPUSubtarget.h:40

llvm::AMDGPUSubtarget::GFX11
@ GFX11
Definition AMDGPUSubtarget.h:43

llvm::AMDGPU::LaneMaskConstants
Definition AMDGPULaneMaskUtils.h:21

llvm::AMDGPU::LaneMaskConstants::CSelectOpc
const unsigned CSelectOpc
Definition AMDGPULaneMaskUtils.h:34

llvm::AMDGPU::LaneMaskConstants::WQMOpc
const unsigned WQMOpc
Definition AMDGPULaneMaskUtils.h:42

llvm::AMDGPU::LaneMaskConstants::VccReg
const Register VccReg
Definition AMDGPULaneMaskUtils.h:24

llvm::AMDGPU::LaneMaskConstants::AndOpc
const unsigned AndOpc
Definition AMDGPULaneMaskUtils.h:25

llvm::AMDGPU::LaneMaskConstants::MovOpc
const unsigned MovOpc
Definition AMDGPULaneMaskUtils.h:35

llvm::AMDGPU::LaneMaskConstants::get
static const LaneMaskConstants & get(const GCNSubtarget &ST)
Definition AMDGPULaneMaskUtils.h:79

llvm::AMDGPU::LaneMaskConstants::XorTermOpc
const unsigned XorTermOpc
Definition AMDGPULaneMaskUtils.h:41

llvm::AMDGPU::LaneMaskConstants::ExecReg
const Register ExecReg
Definition AMDGPULaneMaskUtils.h:23

llvm::AMDGPU::LaneMaskConstants::OrSaveExecOpc
const unsigned OrSaveExecOpc
Definition AMDGPULaneMaskUtils.h:39

llvm::AMDGPU::LaneMaskConstants::AndSaveExecOpc
const unsigned AndSaveExecOpc
Definition AMDGPULaneMaskUtils.h:30

llvm::APFloat
Definition APFloat.h:900

llvm::APInt
Class for arbitrary precision integers.
Definition APInt.h:78

llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562

llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41

llvm::ArrayRef::front
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150

llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147

llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142

llvm::ConstantSDNode
Definition SelectionDAGNodes.h:1740

llvm::ConstantSDNode::getZExtValue
uint64_t getZExtValue() const
Definition SelectionDAGNodes.h:1757

llvm::DebugLoc
A debug info location.
Definition DebugLoc.h:124

llvm::DiagnosticInfoUnsupported
Diagnostic information for unsupported feature in backend.
Definition DiagnosticInfo.h:1101

llvm::DominatorTreeBase::changeImmediateDominator
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
Definition GenericDomTree.h:723

llvm::DominatorTreeBase::addNewBlock
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
Definition GenericDomTree.h:687

llvm::DominatorTreeBase::properlyDominates
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Definition GenericDomTree.h:443

llvm::Function
Definition Function.h:64

llvm::Function::getCallingConv
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270

llvm::Function::getContext
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359

llvm::GCNHazardRecognizer
Definition GCNHazardRecognizer.h:32

llvm::GCNSubtarget
Definition GCNSubtarget.h:34

llvm::GCNSubtarget::TrapID::LLVMAMDHSATrap
@ LLVMAMDHSATrap
Definition GCNSubtarget.h:46

llvm::GCNSubtarget::hasAddNoCarry
bool hasAddNoCarry() const
Definition GCNSubtarget.h:779

llvm::GenericCycleInfo::getCycle
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
Definition GenericCycleImpl.h:530

llvm::GenericCycle::getExitingBlocks
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
Definition GenericCycleImpl.h:77

llvm::GenericCycle::contains
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
Definition GenericCycleInfo.h:135

llvm::GenericCycle::getParentCycle
const GenericCycle * getParentCycle() const
Definition GenericCycleInfo.h:143

llvm::InlineAsm::MIOp_FirstOperand
@ MIOp_FirstOperand
Definition InlineAsm.h:214

llvm::InstrItineraryData
Itinerary data supplied by a subtarget to be used by a target.
Definition MCInstrItineraries.h:110

llvm::LLT
Definition LowLevelType.h:40

llvm::LLT::getAddressSpace
constexpr unsigned getAddressSpace() const
Definition LowLevelType.h:271

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68

llvm::LiveInterval
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition LiveInterval.h:690

llvm::LiveIntervals
Definition LiveIntervals.h:55

llvm::LiveIntervals::hasInterval
bool hasInterval(Register Reg) const
Definition LiveIntervals.h:144

llvm::LiveIntervals::getInstructionIndex
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
Definition LiveIntervals.h:247

llvm::LiveIntervals::getInterval
LiveInterval & getInterval(Register Reg)
Definition LiveIntervals.h:133

llvm::LiveIntervals::shrinkToUses
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
Definition LiveIntervals.cpp:485

llvm::LiveIntervals::ReplaceMachineInstrInMaps
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
Definition LiveIntervals.h:299

llvm::LiveRange
This class represents the liveness of a register, stack slot, etc.
Definition LiveInterval.h:158

llvm::LiveVariables
Definition LiveVariables.h:49

llvm::LiveVariables::replaceKillInstruction
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
Definition LiveVariables.cpp:734

llvm::LiveVariables::getVarInfo
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
Definition LiveVariables.cpp:113

llvm::LocationSize
Definition MemoryLocation.h:67

llvm::LocationSize::hasValue
bool hasValue() const
Definition MemoryLocation.h:152

llvm::LocationSize::precise
static LocationSize precise(uint64_t Value)
Definition MemoryLocation.h:95

llvm::LocationSize::getValue
TypeSize getValue() const
Definition MemoryLocation.h:157

llvm::MCBinaryExpr::createAnd
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348

llvm::MCBinaryExpr::createAShr
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418

llvm::MCBinaryExpr::createSub
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428

llvm::MCConstantExpr::create
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212

llvm::MCInstrDesc
Describe properties that are true of each instruction in the target description file.
Definition MCInstrDesc.h:199

llvm::MCInstrDesc::getNumOperands
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition MCInstrDesc.h:238

llvm::MCInstrDesc::operands
ArrayRef< MCOperandInfo > operands() const
Definition MCInstrDesc.h:240

llvm::MCInstrDesc::mayStore
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition MCInstrDesc.h:446

llvm::MCInstrDesc::mayLoad
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition MCInstrDesc.h:440

llvm::MCInstrDesc::getNumDefs
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition MCInstrDesc.h:249

llvm::MCInstrDesc::getSize
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
Definition MCInstrDesc.h:607

llvm::MCInstrDesc::implicit_uses
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition MCInstrDesc.h:567

llvm::MCInstrDesc::getOpcode
unsigned getOpcode() const
Return the opcode number for this descriptor.
Definition MCInstrDesc.h:231

llvm::MCOperandInfo
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86

llvm::MCOperandInfo::OperandType
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98

llvm::MCOperandInfo::RegClass
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92

llvm::MCRegister
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33

llvm::MCSymbolRefExpr::create
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214

llvm::MCSymbol
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42

llvm::MCSymbol::setVariableValue
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50

llvm::MIBundleBuilder
Helper class for constructing bundles of MachineInstrs.
Definition MachineInstrBuilder.h:579

llvm::MIBundleBuilder::begin
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Definition MachineInstrBuilder.h:617

llvm::MIBundleBuilder::append
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
Definition MachineInstrBuilder.h:652

llvm::MachineBasicBlock
Definition MachineBasicBlock.h:122

llvm::MachineBasicBlock::transferSuccessorsAndUpdatePHIs
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
Definition MachineBasicBlock.cpp:935

llvm::MachineBasicBlock::empty
bool empty() const
Definition MachineBasicBlock.h:349

llvm::MachineBasicBlock::getSymbol
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
Definition MachineBasicBlock.cpp:61

llvm::MachineBasicBlock::rend
reverse_iterator rend()
Definition MachineBasicBlock.h:387

llvm::MachineBasicBlock::insert
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
Definition MachineBasicBlock.cpp:1463

llvm::MachineBasicBlock::getFirstTerminator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Definition MachineBasicBlock.cpp:242

llvm::MachineBasicBlock::addSuccessor
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Definition MachineBasicBlock.cpp:796

llvm::MachineBasicBlock::begin
iterator begin()
Definition MachineBasicBlock.h:377

llvm::MachineBasicBlock::reverse_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Definition MachineBasicBlock.h:343

llvm::MachineBasicBlock::end
iterator end()
Definition MachineBasicBlock.h:379

llvm::MachineBasicBlock::const_instr_iterator
Instructions::const_iterator const_instr_iterator
Definition MachineBasicBlock.h:337

llvm::MachineBasicBlock::getParent
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Definition MachineBasicBlock.h:323

llvm::MachineBasicBlock::successors
iterator_range< succ_iterator > successors()
Definition MachineBasicBlock.h:466

llvm::MachineBasicBlock::splice
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Definition MachineBasicBlock.h:1149

llvm::MachineBasicBlock::iterator
MachineInstrBundleIterator< MachineInstr > iterator
Definition MachineBasicBlock.h:341

llvm::MachineBasicBlock::LQR_Dead
@ LQR_Dead
Register is known to be fully dead.
Definition MachineBasicBlock.h:1222

llvm::MachineDominatorTree
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
Definition MachineDominators.h:71

llvm::MachineFrameInfo
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
Definition MachineFrameInfo.h:108

llvm::MachineFunction
Definition MachineFunction.h:286

llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition MachineFunction.h:762

llvm::MachineFunction::getMachineMemOperand
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
Definition MachineFunction.cpp:536

llvm::MachineFunction::getFrameInfo
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
Definition MachineFunction.h:778

llvm::MachineFunction::push_back
void push_back(MachineBasicBlock *MBB)
Definition MachineFunction.h:1001

llvm::MachineFunction::getContext
MCContext & getContext() const
Definition MachineFunction.h:719

llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition MachineFunction.h:772

llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition MachineFunction.h:733

llvm::MachineFunction::iterator
BasicBlockListType::iterator iterator
Definition MachineFunction.h:966

llvm::MachineFunction::begin
iterator begin()
Definition MachineFunction.h:984

llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Definition MachineFunction.h:860

llvm::MachineFunction::CreateMachineBasicBlock
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
Definition MachineFunction.cpp:499

llvm::MachineFunction::insert
void insert(iterator MBBI, MachineBasicBlock *MBB)
Definition MachineFunction.h:1003

llvm::MachineFunction::getTarget
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Definition MachineFunction.h:758

llvm::MachineInstrBuilder
Definition MachineInstrBuilder.h:98

llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition MachineInstrBuilder.h:160

llvm::MachineInstrBuilder::add
const MachineInstrBuilder & add(const MachineOperand &MO) const
Definition MachineInstrBuilder.h:253

llvm::MachineInstrBuilder::addSym
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
Definition MachineInstrBuilder.h:295

llvm::MachineInstrBuilder::addFrameIndex
const MachineInstrBuilder & addFrameIndex(int Idx) const
Definition MachineInstrBuilder.h:181

llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition MachineInstrBuilder.h:126

llvm::MachineInstrBuilder::addMBB
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Definition MachineInstrBuilder.h:175

llvm::MachineInstrBuilder::cloneMemRefs
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Definition MachineInstrBuilder.h:242

llvm::MachineInstrBuilder::addUse
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
Definition MachineInstrBuilder.h:152

llvm::MachineInstrBuilder::setMIFlags
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
Definition MachineInstrBuilder.h:301

llvm::MachineInstrBuilder::copyImplicitOps
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
Definition MachineInstrBuilder.h:356

llvm::MachineInstrBuilder::addMemOperand
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Definition MachineInstrBuilder.h:231

llvm::MachineInstrBuilder::addDef
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Definition MachineInstrBuilder.h:145

llvm::MachineInstr
Representation of each machine instruction.
Definition MachineInstr.h:72

llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition MachineInstr.h:587

llvm::MachineInstr::mayLoadOrStore
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
Definition MachineInstr.h:1159

llvm::MachineInstr::isCopy
bool isCopy() const
Definition MachineInstr.h:1431

llvm::MachineInstr::getParent
const MachineBasicBlock * getParent() const
Definition MachineInstr.h:359

llvm::MachineInstr::addImplicitDefUseOperands
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
Definition MachineInstr.cpp:90

llvm::MachineInstr::addOperand
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
Definition MachineInstr.cpp:206

llvm::MachineInstr::getNumExplicitOperands
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
Definition MachineInstr.cpp:821

llvm::MachineInstr::implicit_operands
mop_range implicit_operands()
Definition MachineInstr.h:702

llvm::MachineInstr::mayLoad
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
Definition MachineInstr.h:1136

llvm::MachineInstr::hasUnmodeledSideEffects
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
Definition MachineInstr.cpp:1642

llvm::MachineInstr::untieRegOperand
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
Definition MachineInstr.h:1984

llvm::MachineInstr::setDesc
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
Definition MachineInstr.cpp:145

llvm::MachineInstr::hasOneMemOperand
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition MachineInstr.h:813

llvm::MachineInstr::explicit_operands
mop_range explicit_operands()
Definition MachineInstr.h:696

llvm::MachineInstr::tieOperands
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
Definition MachineInstr.cpp:1189

llvm::MachineInstr::memoperands_begin
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition MachineInstr.h:798

llvm::MachineInstr::hasOrderedMemoryRef
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
Definition MachineInstr.cpp:1571

llvm::MachineInstr::MIFlag
MIFlag
Definition MachineInstr.h:86

llvm::MachineInstr::getMF
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
Definition MachineInstr.cpp:756

llvm::MachineInstr::memoperands
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition MachineInstr.h:780

llvm::MachineInstr::mayStore
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
Definition MachineInstr.h:1149

llvm::MachineInstr::getDebugLoc
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition MachineInstr.h:511

llvm::MachineInstr::isMoveImmediate
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
Definition MachineInstr.h:1025

llvm::MachineInstr::eraseFromParent
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Definition MachineInstr.cpp:770

llvm::MachineInstr::removeOperand
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
Definition MachineInstr.cpp:296

llvm::MachineInstr::setPostInstrSymbol
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
Definition MachineInstr.cpp:494

llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition MachineInstr.h:595

llvm::MachineInstr::getFlags
uint32_t getFlags() const
Return the MI flags bitvector.
Definition MachineInstr.h:404

llvm::MachineInstr::findRegisterDefOperandIdx
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
Definition MachineInstr.cpp:1130

llvm::MachineMemOperand
A description of a memory reference used in the backend.
Definition MachineMemOperand.h:130

llvm::MachineMemOperand::MOLoad
@ MOLoad
The memory access reads data.
Definition MachineMemOperand.h:137

llvm::MachineMemOperand::MOStore
@ MOStore
The memory access writes data.
Definition MachineMemOperand.h:139

llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition MachineOperand.h:48

llvm::MachineOperand::setSubReg
void setSubReg(unsigned subReg)
Definition MachineOperand.h:489

llvm::MachineOperand::getSubReg
unsigned getSubReg() const
Definition MachineOperand.h:373

llvm::MachineOperand::getOperandNo
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
Definition MachineOperand.cpp:55

llvm::MachineOperand::getGlobal
const GlobalValue * getGlobal() const
Definition MachineOperand.h:582

llvm::MachineOperand::setImplicit
void setImplicit(bool Val=true)
Definition MachineOperand.h:514

llvm::MachineOperand::isUndef
bool isUndef() const
Definition MachineOperand.h:403

llvm::MachineOperand::ChangeToFrameIndex
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
Definition MachineOperand.cpp:232

llvm::MachineOperand::setImm
void setImm(int64_t immVal)
Definition MachineOperand.h:685

llvm::MachineOperand::getImm
int64_t getImm() const
Definition MachineOperand.h:556

llvm::MachineOperand::isImplicit
bool isImplicit() const
Definition MachineOperand.h:388

llvm::MachineOperand::isKill
bool isKill() const
Definition MachineOperand.h:398

llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition MachineOperand.h:328

llvm::MachineOperand::setIsDead
void setIsDead(bool Val=true)
Definition MachineOperand.h:525

llvm::MachineOperand::setReg
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Definition MachineOperand.cpp:60

llvm::MachineOperand::isUse
bool isUse() const
Definition MachineOperand.h:378

llvm::MachineOperand::isImm
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Definition MachineOperand.h:330

llvm::MachineOperand::ChangeToImmediate
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
Definition MachineOperand.cpp:161

llvm::MachineOperand::ChangeToGA
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
Definition MachineOperand.cpp:195

llvm::MachineOperand::setIsKill
void setIsKill(bool Val=true)
Definition MachineOperand.h:519

llvm::MachineOperand::ChangeToRegister
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
Definition MachineOperand.cpp:272

llvm::MachineOperand::getParent
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Definition MachineOperand.h:243

llvm::MachineOperand::setOffset
void setOffset(int64_t Offset)
Definition MachineOperand.h:700

llvm::MachineOperand::isDebug
bool isDebug() const
Definition MachineOperand.h:454

llvm::MachineOperand::getIndex
int getIndex() const
Definition MachineOperand.h:576

llvm::MachineOperand::isDead
bool isDead() const
Definition MachineOperand.h:393

llvm::MachineOperand::getTargetFlags
unsigned getTargetFlags() const
Definition MachineOperand.h:226

llvm::MachineOperand::CreateImm
static MachineOperand CreateImm(int64_t Val)
Definition MachineOperand.h:821

llvm::MachineOperand::isGlobal
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
Definition MachineOperand.h:346

llvm::MachineOperand::getType
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
Definition MachineOperand.h:224

llvm::MachineOperand::setIsUndef
void setIsUndef(bool Val=true)
Definition MachineOperand.h:530

llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition MachineOperand.h:368

llvm::MachineOperand::isTargetIndex
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
Definition MachineOperand.h:342

llvm::MachineOperand::setTargetFlags
void setTargetFlags(unsigned F)
Definition MachineOperand.h:229

llvm::MachineOperand::isFI
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
Definition MachineOperand.h:338

llvm::MachineOperand::isIdenticalTo
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Definition MachineOperand.cpp:318

llvm::MachineOperand::MO_Immediate
@ MO_Immediate
Immediate operand.
Definition MachineOperand.h:52

llvm::MachineOperand::MO_Register
@ MO_Register
Register operand.
Definition MachineOperand.h:51

llvm::MachineOperand::CreateReg
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
Definition MachineOperand.h:839

llvm::MachineOperand::getOffset
int64_t getOffset() const
Return the offset from the symbol in this operand.
Definition MachineOperand.h:629

llvm::MachineOperand::isFPImm
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
Definition MachineOperand.h:334

llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition MachineRegisterInfo.h:53

llvm::RegScavenger
Definition RegisterScavenging.h:34

llvm::RegScavenger::enterBasicBlockEnd
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
Definition RegisterScavenging.cpp:75

llvm::RegScavenger::isRegUsed
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
Definition RegisterScavenging.cpp:94

llvm::RegScavenger::setRegUsed
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
Definition RegisterScavenging.cpp:50

llvm::RegScavenger::backward
void backward()
Update internal register state and move MBB iterator backwards.
Definition RegisterScavenging.cpp:81

llvm::RegScavenger::enterBasicBlock
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Definition RegisterScavenging.cpp:69

llvm::RegScavenger::scavengeRegisterBackwards
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
Definition RegisterScavenging.cpp:295

llvm::RegisterBankInfo::getRegBank
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
Definition RegisterBankInfo.h:440

llvm::RegisterBank
This class implements the register bank concept.
Definition RegisterBank.h:29

llvm::RegisterBank::getID
unsigned getID() const
Get the identifier of this register bank.
Definition RegisterBank.h:46

llvm::Register
Wrapper class representing virtual and physical registers.
Definition Register.h:19

llvm::Register::asMCReg
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:102

llvm::Register::isValid
constexpr bool isValid() const
Definition Register.h:107

llvm::Register::isVirtual
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74

llvm::Register::isPhysical
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78

llvm::SDNode
Represents one node in the SelectionDAG.
Definition SelectionDAGNodes.h:501

llvm::SDNode::isMachineOpcode
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
Definition SelectionDAGNodes.h:745

llvm::SDNode::getAsZExtVal
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
Definition SelectionDAGNodes.h:1783

llvm::SDNode::getMachineOpcode
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
Definition SelectionDAGNodes.h:750

llvm::SDNode::getOperand
const SDValue & getOperand(unsigned Num) const
Definition SelectionDAGNodes.h:1034

llvm::SDNode::getConstantOperandVal
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Definition SelectionDAGNodes.h:1779

llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition SelectionDAGNodes.h:147

llvm::SIInstrInfo
Definition SIInstrInfo.h:90

llvm::SIInstrInfo::isLegalMUBUFImmOffset
bool isLegalMUBUFImmOffset(unsigned Imm) const
Definition SIInstrInfo.cpp:9661

llvm::SIInstrInfo::isInlineConstant
bool isInlineConstant(const APInt &Imm) const
Definition SIInstrInfo.cpp:4424

llvm::SIInstrInfo::legalizeOperandsVOP3
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
Definition SIInstrInfo.cpp:6452

llvm::SIInstrInfo::isDS
static bool isDS(const MachineInstr &MI)
Definition SIInstrInfo.h:590

llvm::SIInstrInfo::legalizeOperands
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
Definition SIInstrInfo.cpp:7075

llvm::SIInstrInfo::areLoadsFromSameBasePtr
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
Definition SIInstrInfo.cpp:238

llvm::SIInstrInfo::getLiveRangeSplitOpcode
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
Definition SIInstrInfo.cpp:9566

llvm::SIInstrInfo::getMemOperandsWithOffsetWidth
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Definition SIInstrInfo.cpp:364

llvm::SIInstrInfo::isSGPRStackAccess
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
Definition SIInstrInfo.cpp:9343

llvm::SIInstrInfo::getInstSizeInBytes
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
Definition SIInstrInfo.cpp:9391

llvm::SIInstrInfo::isNeverUniform
static bool isNeverUniform(const MachineInstr &MI)
Definition SIInstrInfo.h:989

llvm::SIInstrInfo::getOpSize
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
Definition SIInstrInfo.h:1267

llvm::SIInstrInfo::isXDLWMMA
bool isXDLWMMA(const MachineInstr &MI) const
Definition SIInstrInfo.cpp:10680

llvm::SIInstrInfo::isBasicBlockPrologue
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
Definition SIInstrInfo.cpp:9576

llvm::SIInstrInfo::getDefaultRsrcDataFormat
uint64_t getDefaultRsrcDataFormat() const
Definition SIInstrInfo.cpp:9270

llvm::SIInstrInfo::isSOPP
static bool isSOPP(const MachineInstr &MI)
Definition SIInstrInfo.h:508

llvm::SIInstrInfo::getGenericInstructionUniformity
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
Definition SIInstrInfo.cpp:10244

llvm::SIInstrInfo::isIGLP
bool isIGLP(unsigned Opcode) const
Definition SIInstrInfo.h:1030

llvm::SIInstrInfo::isFLATScratch
static bool isFLATScratch(const MachineInstr &MI)
Definition SIInstrInfo.h:672

llvm::SIInstrInfo::getIndirectRegWriteMovRelPseudo
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
Definition SIInstrInfo.cpp:1528

llvm::SIInstrInfo::getAddNoCarry
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
Definition SIInstrInfo.cpp:9602

llvm::SIInstrInfo::mayAccessFlatAddressSpace
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
Definition SIInstrInfo.cpp:9477

llvm::SIInstrInfo::shouldScheduleLoadsNear
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
Definition SIInstrInfo.cpp:613

llvm::SIInstrInfo::splitMUBUFOffset
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
Definition SIInstrInfo.cpp:9705

llvm::SIInstrInfo::getSerializableDirectMachineOperandTargetFlags
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
Definition SIInstrInfo.cpp:9537

llvm::SIInstrInfo::moveToVALU
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
Definition SIInstrInfo.cpp:7535

llvm::SIInstrInfo::isSMRD
static bool isSMRD(const MachineInstr &MI)
Definition SIInstrInfo.h:580

llvm::SIInstrInfo::restoreExec
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
Definition SIInstrInfo.cpp:5932

llvm::SIInstrInfo::usesConstantBus
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
Definition SIInstrInfo.cpp:4796

llvm::SIInstrInfo::getMaxMUBUFImmOffset
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Definition SIInstrInfo.cpp:9665

llvm::SIInstrInfo::isStoreToStackSlot
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
Definition SIInstrInfo.cpp:9365

llvm::SIInstrInfo::mayAccessScratchThroughFlat
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
Definition SIInstrInfo.cpp:4317

llvm::SIInstrInfo::legalizeOperandsFLAT
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
Definition SIInstrInfo.cpp:6720

llvm::SIInstrInfo::optimizeCompareInstr
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
Definition SIInstrInfo.cpp:10488

llvm::SIInstrInfo::extractSubregFromImm
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Definition SIInstrInfo.cpp:3451

llvm::SIInstrInfo::isStackAccess
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
Definition SIInstrInfo.cpp:9330

llvm::SIInstrInfo::isMTBUF
static bool isMTBUF(const MachineInstr &MI)
Definition SIInstrInfo.h:572

llvm::SIInstrInfo::getIndirectGPRIDXPseudo
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
Definition SIInstrInfo.cpp:1395

llvm::SIInstrInfo::insertReturn
void insertReturn(MachineBasicBlock &MBB) const
Definition SIInstrInfo.cpp:1942

llvm::SIInstrInfo::isDGEMM
static bool isDGEMM(unsigned Opcode)
Definition SIInstrInfo.h:895

llvm::SIInstrInfo::isEXP
static bool isEXP(const MachineInstr &MI)
Definition SIInstrInfo.h:705

llvm::SIInstrInfo::isSALU
static bool isSALU(const MachineInstr &MI)
Definition SIInstrInfo.h:444

llvm::SIInstrInfo::legalizeGenericOperand
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
Definition SIInstrInfo.cpp:6741

llvm::SIInstrInfo::buildShrunkInst
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
Definition SIInstrInfo.cpp:4726

llvm::SIInstrInfo::getInstBundleSize
unsigned getInstBundleSize(const MachineInstr &MI) const
Definition SIInstrInfo.cpp:9379

llvm::SIInstrInfo::isVOP2
static bool isVOP2(const MachineInstr &MI)
Definition SIInstrInfo.h:532

llvm::SIInstrInfo::analyzeBranch
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
Definition SIInstrInfo.cpp:3126

llvm::SIInstrInfo::isSDWA
static bool isSDWA(const MachineInstr &MI)
Definition SIInstrInfo.h:548

llvm::SIInstrInfo::getKillTerminatorFromPseudo
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
Definition SIInstrInfo.cpp:9650

llvm::SIInstrInfo::insertNoops
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
Definition SIInstrInfo.cpp:1931

llvm::SIInstrInfo::isGather4
static bool isGather4(const MachineInstr &MI)
Definition SIInstrInfo.h:640

llvm::SIInstrInfo::getWholeWaveFunctionSetup
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
Definition SIInstrInfo.cpp:5944

llvm::SIInstrInfo::isLegalVSrcOperand
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
Definition SIInstrInfo.cpp:6168

llvm::SIInstrInfo::isDOT
static bool isDOT(const MachineInstr &MI)
Definition SIInstrInfo.h:863

llvm::SIInstrInfo::createPHISourceCopy
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
Definition SIInstrInfo.cpp:10168

llvm::SIInstrInfo::hasModifiers
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
Definition SIInstrInfo.cpp:4632

llvm::SIInstrInfo::shouldClusterMemOps
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Definition SIInstrInfo.cpp:561

llvm::SIInstrInfo::isSWMMAC
static bool isSWMMAC(const MachineInstr &MI)
Definition SIInstrInfo.h:879

llvm::SIInstrInfo::CreateTargetMIHazardRecognizer
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
Definition SIInstrInfo.cpp:9520

llvm::SIInstrInfo::isWave32
bool isWave32() const
Definition SIInstrInfo.cpp:10186

llvm::SIInstrInfo::isHighLatencyDef
bool isHighLatencyDef(int Opc) const override
Definition SIInstrInfo.cpp:9325

llvm::SIInstrInfo::legalizeOpWithMove
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
Definition SIInstrInfo.cpp:6019

llvm::SIInstrInfo::reverseBranchCondition
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Definition SIInstrInfo.cpp:3249

llvm::SIInstrInfo::isVOPC
static bool isVOPC(const MachineInstr &MI)
Definition SIInstrInfo.h:556

llvm::SIInstrInfo::removeModOperands
void removeModOperands(MachineInstr &MI) const
Definition SIInstrInfo.cpp:3442

llvm::SIInstrInfo::splitFlatOffset
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
Definition SIInstrInfo.cpp:9805

llvm::SIInstrInfo::isSpill
bool isSpill(uint16_t Opcode) const
Definition SIInstrInfo.h:796

llvm::SIInstrInfo::getVectorRegSpillRestoreOpcode
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
Definition SIInstrInfo.cpp:1856

llvm::SIInstrInfo::isXDL
bool isXDL(const MachineInstr &MI) const
Definition SIInstrInfo.cpp:10690

llvm::SIInstrInfo::isVIMAGE
static bool isVIMAGE(const MachineInstr &MI)
Definition SIInstrInfo.h:624

llvm::SIInstrInfo::enforceOperandRCAlignment
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
Definition SIInstrInfo.cpp:10639

llvm::SIInstrInfo::isSOP2
static bool isSOP2(const MachineInstr &MI)
Definition SIInstrInfo.h:484

llvm::SIInstrInfo::isGWS
static bool isGWS(const MachineInstr &MI)
Definition SIInstrInfo.h:606

llvm::SIInstrInfo::isLegalAV64PseudoImm
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
Definition SIInstrInfo.cpp:4614

llvm::SIInstrInfo::hasModifiersSet
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
Definition SIInstrInfo.cpp:4639

llvm::SIInstrInfo::getPreferredSelectRegClass
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
Definition SIInstrInfo.cpp:1187

llvm::SIInstrInfo::isLegalToSwap
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
Definition SIInstrInfo.cpp:2752

llvm::SIInstrInfo::isReallyTriviallyReMaterializable
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
Definition SIInstrInfo.cpp:127

llvm::SIInstrInfo::isFLATGlobal
static bool isFLATGlobal(const MachineInstr &MI)
Definition SIInstrInfo.h:664

llvm::SIInstrInfo::isGlobalMemoryObject
bool isGlobalMemoryObject(const MachineInstr *MI) const override
Definition SIInstrInfo.cpp:10673

llvm::SIInstrInfo::isVSAMPLE
static bool isVSAMPLE(const MachineInstr &MI)
Definition SIInstrInfo.h:632

llvm::SIInstrInfo::isBufferSMRD
bool isBufferSMRD(const MachineInstr &MI) const
Definition SIInstrInfo.cpp:9685

llvm::SIInstrInfo::isKillTerminator
static bool isKillTerminator(unsigned Opcode)
Definition SIInstrInfo.cpp:9640

llvm::SIInstrInfo::findCommutedOpIndices
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
Definition SIInstrInfo.cpp:2852

llvm::SIInstrInfo::storeRegToStackSlot
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
Definition SIInstrInfo.cpp:1680

llvm::SIInstrInfo::insertScratchExecCopy
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
Definition SIInstrInfo.cpp:5902

llvm::SIInstrInfo::hasVALU32BitEncoding
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
Definition SIInstrInfo.cpp:4620

llvm::SIInstrInfo::getMovOpcode
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
Definition SIInstrInfo.cpp:1376

llvm::SIInstrInfo::reMaterialize
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
Definition SIInstrInfo.cpp:2532

llvm::SIInstrInfo::buildExtractSubReg
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
Definition SIInstrInfo.cpp:6042

llvm::SIInstrInfo::legalizeOperandsVOP2
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
Definition SIInstrInfo.cpp:6334

llvm::SIInstrInfo::foldImmediate
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
Definition SIInstrInfo.cpp:3539

llvm::SIInstrInfo::isImage
static bool isImage(const MachineInstr &MI)
Definition SIInstrInfo.h:460

llvm::SIInstrInfo::isSOPK
static bool isSOPK(const MachineInstr &MI)
Definition SIInstrInfo.h:500

llvm::SIInstrInfo::getOpRegClass
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
Definition SIInstrInfo.cpp:6000

llvm::SIInstrInfo::insertSimulatedTrap
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
Definition SIInstrInfo.cpp:1960

llvm::SIInstrInfo::getNonSoftWaitcntOpcode
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition SIInstrInfo.h:1043

llvm::SIInstrInfo::getDSShaderTypeValue
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
Definition SIInstrInfo.cpp:10403

llvm::SIInstrInfo::isFoldableCopy
static bool isFoldableCopy(const MachineInstr &MI)
Definition SIInstrInfo.cpp:3412

llvm::SIInstrInfo::isIgnorableUse
bool isIgnorableUse(const MachineOperand &MO) const override
Definition SIInstrInfo.cpp:192

llvm::SIInstrInfo::isMUBUF
static bool isMUBUF(const MachineInstr &MI)
Definition SIInstrInfo.h:564

llvm::SIInstrInfo::expandPostRAPseudo
bool expandPostRAPseudo(MachineInstr &MI) const override
Definition SIInstrInfo.cpp:2032

llvm::SIInstrInfo::analyzeCompare
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
Definition SIInstrInfo.cpp:10429

llvm::SIInstrInfo::getRegClass
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Definition SIInstrInfo.cpp:5988

llvm::SIInstrInfo::getInstructionUniformity
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
Definition SIInstrInfo.cpp:10319

llvm::SIInstrInfo::isSegmentSpecificFLAT
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition SIInstrInfo.h:654

llvm::SIInstrInfo::isVOP3
static bool isVOP3(const MCInstrDesc &Desc)
Definition SIInstrInfo.h:540

llvm::SIInstrInfo::physRegUsesConstantBus
bool physRegUsesConstantBus(const MachineOperand &Reg) const
Definition SIInstrInfo.cpp:4771

llvm::SIInstrInfo::isF16PseudoScalarTrans
static bool isF16PseudoScalarTrans(unsigned Opcode)
Definition SIInstrInfo.h:1014

llvm::SIInstrInfo::insertSelect
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
Definition SIInstrInfo.cpp:3305

llvm::SIInstrInfo::isDPP
static bool isDPP(const MachineInstr &MI)
Definition SIInstrInfo.h:818

llvm::SIInstrInfo::analyzeBranchImpl
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
Definition SIInstrInfo.cpp:3089

llvm::SIInstrInfo::isLowLatencyInstruction
bool isLowLatencyInstruction(const MachineInstr &MI) const
Definition SIInstrInfo.cpp:9319

llvm::SIInstrInfo::isCopyInstrImpl
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is a instruction that moves/copies value from one register to ano...
Definition SIInstrInfo.cpp:2683

llvm::SIInstrInfo::isAlwaysGDS
bool isAlwaysGDS(uint16_t Opcode) const
Definition SIInstrInfo.cpp:4311

llvm::SIInstrInfo::isMAI
static bool isMAI(const MCInstrDesc &Desc)
Definition SIInstrInfo.h:850

llvm::SIInstrInfo::isLoadFromStackSlot
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
Definition SIInstrInfo.cpp:9351

llvm::SIInstrInfo::legalizeOperandsVALUt16
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
Definition SIInstrInfo.cpp:7529

llvm::SIInstrInfo::moveToVALUImpl
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
Definition SIInstrInfo.cpp:7556

llvm::SIInstrInfo::isImmOperandLegal
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
Definition SIInstrInfo.cpp:4603

llvm::SIInstrInfo::canShrink
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
Definition SIInstrInfo.cpp:4650

llvm::SIInstrInfo::isAsmOnlyOpcode
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
Definition SIInstrInfo.cpp:9865

llvm::SIInstrInfo::isVGPRSpill
static bool isVGPRSpill(const MachineInstr &MI)
Definition SIInstrInfo.h:772

llvm::SIInstrInfo::CreateTargetPostRAHazardRecognizer
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
Definition SIInstrInfo.cpp:9505

llvm::SIInstrInfo::verifyInstruction
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
Definition SIInstrInfo.cpp:4880

llvm::SIInstrInfo::isLegalFLATOffset
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
Definition SIInstrInfo.cpp:9781

llvm::SIInstrInfo::isWWMRegSpillOpcode
static bool isWWMRegSpillOpcode(uint16_t Opcode)
Definition SIInstrInfo.h:806

llvm::SIInstrInfo::getInstrLatency
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
Definition SIInstrInfo.cpp:10226

llvm::SIInstrInfo::foldMemoryOperandImpl
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
Definition SIInstrInfo.cpp:10188

llvm::SIInstrInfo::getNamedImmOperand
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
Definition SIInstrInfo.h:1419

llvm::SIInstrInfo::getSerializableTargetIndices
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
Definition SIInstrInfo.cpp:9492

llvm::SIInstrInfo::regUsesConstantBus
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
Definition SIInstrInfo.cpp:4789

llvm::SIInstrInfo::isMIMG
static bool isMIMG(const MachineInstr &MI)
Definition SIInstrInfo.h:616

llvm::SIInstrInfo::buildExtractSubRegOrImm
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
Definition SIInstrInfo.cpp:6059

llvm::SIInstrInfo::isSchedulingBoundary
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
Definition SIInstrInfo.cpp:4279

llvm::SIInstrInfo::loadRegFromStackSlot
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
Definition SIInstrInfo.cpp:1873

llvm::SIInstrInfo::isLegalRegOperand
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
Definition SIInstrInfo.cpp:6085

llvm::SIInstrInfo::allowNegativeFlatOffset
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
Definition SIInstrInfo.cpp:9836

llvm::SIInstrInfo::getNumWaitStates
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
Definition SIInstrInfo.cpp:2018

llvm::SIInstrInfo::getVectorRegSpillSaveOpcode
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
Definition SIInstrInfo.cpp:1664

llvm::SIInstrInfo::getVALUOp
unsigned getVALUOp(const MachineInstr &MI) const
Definition SIInstrInfo.cpp:5677

llvm::SIInstrInfo::modifiesModeRegister
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.q.
Definition SIInstrInfo.cpp:4346

llvm::SIInstrInfo::readlaneVGPRToSGPR
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
Definition SIInstrInfo.cpp:6574

llvm::SIInstrInfo::hasDivergentBranch
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminate with divergent branch.
Definition SIInstrInfo.cpp:2897

llvm::SIInstrInfo::removeBranch
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Definition SIInstrInfo.cpp:3171

llvm::SIInstrInfo::fixImplicitOperands
void fixImplicitOperands(MachineInstr &MI) const
Definition SIInstrInfo.cpp:9672

llvm::SIInstrInfo::moveFlatAddrToVGPR
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
Definition SIInstrInfo.cpp:6639

llvm::SIInstrInfo::copyPhysReg
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
Definition SIInstrInfo.cpp:814

llvm::SIInstrInfo::swapSourceModifiers
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Definition SIInstrInfo.cpp:2690

llvm::SIInstrInfo::insertNE
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
Definition SIInstrInfo.cpp:1314

llvm::SIInstrInfo::getBranchDestBlock
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
Definition SIInstrInfo.cpp:2893

llvm::SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
Definition SIInstrInfo.cpp:4353

llvm::SIInstrInfo::getConstValDefinedInReg
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
Definition SIInstrInfo.cpp:1327

llvm::SIInstrInfo::isAtomic
static bool isAtomic(const MachineInstr &MI)
Definition SIInstrInfo.h:737

llvm::SIInstrInfo::canInsertSelect
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
Definition SIInstrInfo.cpp:3263

llvm::SIInstrInfo::isLiteralOperandLegal
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
Definition SIInstrInfo.cpp:4575

llvm::SIInstrInfo::sopkIsZext
static bool sopkIsZext(unsigned Opcode)
Definition SIInstrInfo.h:927

llvm::SIInstrInfo::isSGPRSpill
static bool isSGPRSpill(const MachineInstr &MI)
Definition SIInstrInfo.h:784

llvm::SIInstrInfo::isWMMA
static bool isWMMA(const MachineInstr &MI)
Definition SIInstrInfo.h:867

llvm::SIInstrInfo::getSerializableMachineMemOperandTargetFlags
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
Definition SIInstrInfo.cpp:9555

llvm::SIInstrInfo::convertToThreeAddress
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
Definition SIInstrInfo.cpp:4022

llvm::SIInstrInfo::mayReadEXEC
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
Definition SIInstrInfo.cpp:4399

llvm::SIInstrInfo::legalizeOperandsSMRD
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
Definition SIInstrInfo.cpp:6620

llvm::SIInstrInfo::isBranchOffsetInRange
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
Definition SIInstrInfo.cpp:2876

llvm::SIInstrInfo::insertBranch
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Definition SIInstrInfo.cpp:3197

llvm::SIInstrInfo::insertVectorSelect
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
Definition SIInstrInfo.cpp:1191

llvm::SIInstrInfo::insertNoop
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
Definition SIInstrInfo.cpp:1926

llvm::SIInstrInfo::expandMovDPP64
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Definition SIInstrInfo.cpp:2619

llvm::SIInstrInfo::insertEQ
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
Definition SIInstrInfo.cpp:1301

llvm::SIInstrInfo::isSOPC
static bool isSOPC(const MachineInstr &MI)
Definition SIInstrInfo.h:492

llvm::SIInstrInfo::isFLAT
static bool isFLAT(const MachineInstr &MI)
Definition SIInstrInfo.h:648

llvm::SIInstrInfo::isVALU
static bool isVALU(const MachineInstr &MI)
Definition SIInstrInfo.h:452

llvm::SIInstrInfo::MO_GOTPCREL32_LO
@ MO_GOTPCREL32_LO
Definition SIInstrInfo.h:218

llvm::SIInstrInfo::MO_MASK
@ MO_MASK
Definition SIInstrInfo.h:211

llvm::SIInstrInfo::MO_GOTPCREL64
@ MO_GOTPCREL64
Definition SIInstrInfo.h:222

llvm::SIInstrInfo::MO_ABS32_HI
@ MO_ABS32_HI
Definition SIInstrInfo.h:233

llvm::SIInstrInfo::MO_FAR_BRANCH_OFFSET
@ MO_FAR_BRANCH_OFFSET
Definition SIInstrInfo.h:230

llvm::SIInstrInfo::MO_REL32_HI
@ MO_REL32_HI
Definition SIInstrInfo.h:227

llvm::SIInstrInfo::MO_REL32_LO
@ MO_REL32_LO
Definition SIInstrInfo.h:225

llvm::SIInstrInfo::MO_REL64
@ MO_REL64
Definition SIInstrInfo.h:228

llvm::SIInstrInfo::MO_GOTPCREL32_HI
@ MO_GOTPCREL32_HI
Definition SIInstrInfo.h:220

llvm::SIInstrInfo::MO_GOTPCREL
@ MO_GOTPCREL
Definition SIInstrInfo.h:215

llvm::SIInstrInfo::MO_ABS64
@ MO_ABS64
Definition SIInstrInfo.h:234

llvm::SIInstrInfo::MO_ABS32_LO
@ MO_ABS32_LO
Definition SIInstrInfo.h:232

llvm::SIInstrInfo::isBarrier
bool isBarrier(unsigned Opcode) const
Definition SIInstrInfo.h:1004

llvm::SIInstrInfo::commuteInstructionImpl
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
Definition SIInstrInfo.cpp:2796

llvm::SIInstrInfo::pseudoToMCOpcode
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
Definition SIInstrInfo.cpp:9918

llvm::SIInstrInfo::getMCOpcodeFromPseudo
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition SIInstrInfo.h:1433

llvm::SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
Definition SIInstrInfo.cpp:6179

llvm::SIInstrInfo::createPHIDestinationCopy
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
Definition SIInstrInfo.cpp:10153

llvm::SIInstrInfo::isFixedSize
static bool isFixedSize(const MachineInstr &MI)
Definition SIInstrInfo.h:945

llvm::SIInstrInfo::isSafeToSink
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
Definition SIInstrInfo.cpp:198

llvm::SIInstrInfo::commuteOpcode
LLVM_READONLY int commuteOpcode(unsigned Opc) const
Definition SIInstrInfo.cpp:1168

llvm::SIInstrInfo::getScratchRsrcWords23
uint64_t getScratchRsrcWords23() const
Definition SIInstrInfo.cpp:9295

llvm::SIInstrInfo::getNamedOperand
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
Definition SIInstrInfo.cpp:9258

llvm::SIInstrInfo::decomposeMachineOperandsTargetFlags
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
Definition SIInstrInfo.cpp:9532

llvm::SIInstrInfo::areMemAccessesTriviallyDisjoint
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
Definition SIInstrInfo.cpp:3890

llvm::SIInstrInfo::isOperandLegal
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
Definition SIInstrInfo.cpp:6212

llvm::SIInstrInfo::isLDSDMA
static bool isLDSDMA(const MachineInstr &MI)
Definition SIInstrInfo.h:598

llvm::SIInstrInfo::isVOP1
static bool isVOP1(const MachineInstr &MI)
Definition SIInstrInfo.h:524

llvm::SIInstrInfo::SIInstrInfo
SIInstrInfo(const GCNSubtarget &ST)
Definition SIInstrInfo.cpp:65

llvm::SIInstrInfo::insertIndirectBranch
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
Definition SIInstrInfo.cpp:2906

llvm::SIInstrInfo::hasAnyModifiersSet
bool hasAnyModifiersSet(const MachineInstr &MI) const
Definition SIInstrInfo.cpp:4645

llvm::SIMachineFunctionInfo
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Definition SIMachineFunctionInfo.h:412

llvm::SIMachineFunctionInfo::getLongBranchReservedReg
Register getLongBranchReservedReg() const
Definition SIMachineFunctionInfo.h:1051

llvm::SIMachineFunctionInfo::isWholeWaveFunction
bool isWholeWaveFunction() const
Definition SIMachineFunctionInfo.h:687

llvm::SIMachineFunctionInfo::getStackPtrOffsetReg
Register getStackPtrOffsetReg() const
Definition SIMachineFunctionInfo.h:1047

llvm::SIMachineFunctionInfo::getMaxMemoryClusterDWords
unsigned getMaxMemoryClusterDWords() const
Definition SIMachineFunctionInfo.h:1197

llvm::SIMachineFunctionInfo::setHasSpilledVGPRs
void setHasSpilledVGPRs(bool Spill=true)
Definition SIMachineFunctionInfo.h:1073

llvm::SIMachineFunctionInfo::isWWMReg
bool isWWMReg(Register Reg) const
Definition SIMachineFunctionInfo.h:658

llvm::SIMachineFunctionInfo::checkFlag
bool checkFlag(Register Reg, uint8_t Flag) const
Definition SIMachineFunctionInfo.h:775

llvm::SIMachineFunctionInfo::setHasSpilledSGPRs
void setHasSpilledSGPRs(bool Spill=true)
Definition SIMachineFunctionInfo.h:1065

llvm::SIRegisterInfo
Definition SIRegisterInfo.h:40

llvm::SIRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(unsigned RCID) const
Definition SIRegisterInfo.cpp:3900

llvm::SIRegisterInfo::getSubRegFromChannel
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
Definition SIRegisterInfo.cpp:552

llvm::SIRegisterInfo::getProperlyAlignedRC
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
Definition SIRegisterInfo.cpp:4002

llvm::SIRegisterInfo::getRegSplitParts
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
Definition SIRegisterInfo.cpp:3675

llvm::SIRegisterInfo::getHWRegIndex
unsigned getHWRegIndex(MCRegister Reg) const
Definition SIRegisterInfo.cpp:3276

llvm::SIRegisterInfo::getRegPressureLimit
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
Definition SIRegisterInfo.cpp:3739

llvm::SIRegisterInfo::getChannelFromSubReg
unsigned getChannelFromSubReg(unsigned SubReg) const
Definition SIRegisterInfo.h:419

llvm::SIRegisterInfo::isAGPRClass
static bool isAGPRClass(const TargetRegisterClass *RC)
Definition SIRegisterInfo.h:246

llvm::ScheduleDAGMI
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
Definition MachineScheduler.h:308

llvm::ScheduleDAGMI::hasVRegLiveness
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
Definition MachineScheduler.h:349

llvm::ScheduleDAG
Definition ScheduleDAG.h:581

llvm::ScheduleDAG::MF
MachineFunction & MF
Machine function.
Definition ScheduleDAG.h:586

llvm::ScheduleHazardRecognizer
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
Definition ScheduleHazardRecognizer.h:25

llvm::SlotIndex
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66

llvm::SlotIndex::getRegSlot
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition SlotIndexes.h:238

llvm::SlotIndexes
SlotIndexes pass.
Definition SlotIndexes.h:298

llvm::SlotIndexes::insertMachineInstrInMaps
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition SlotIndexes.h:532

llvm::SmallDenseSet
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:281

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition SmallVector.h:574

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:414

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1197

llvm::SparseBitVector::clear
void clear()
Definition SparseBitVector.h:452

llvm::SrcOp
Definition MachineIRBuilder.h:143

llvm::SrcOp::getImm
int64_t getImm() const
Definition MachineIRBuilder.h:217

llvm::SrcOp::getReg
Register getReg() const
Definition MachineIRBuilder.h:195

llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55

llvm::TargetInstrInfo::CreateTargetMIHazardRecognizer
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
Definition TargetInstrInfo.cpp:1729

llvm::TargetInstrInfo::createPHIDestinationCopy
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI eleimination lets target to make necessary checks and insert the copy to the PHI destinati...
Definition TargetInstrInfo.h:2134

llvm::TargetInstrInfo::reMaterialize
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
Definition TargetInstrInfo.cpp:451

llvm::TargetInstrInfo::createPHISourceCopy
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI eleimination lets target to make necessary checks and insert the copy to the PHI destinati...
Definition TargetInstrInfo.h:2144

llvm::TargetInstrInfo::isReallyTriviallyReMaterializable
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
Definition TargetInstrInfo.cpp:1590

llvm::TargetInstrInfo::commuteInstructionImpl
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
Definition TargetInstrInfo.cpp:181

llvm::TargetInstrInfo::isGlobalMemoryObject
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
Definition TargetInstrInfo.cpp:2236

llvm::TargetInstrInfo::expandPostRAPseudo
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
Definition TargetInstrInfo.h:1195

llvm::TargetMachine::getMCAsmInfo
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
Definition TargetMachine.h:240

llvm::TargetRegisterClass
Definition TargetRegisterInfo.h:45

llvm::TargetRegisterClass::contains
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
Definition TargetRegisterInfo.h:95

llvm::TargetRegisterClass::hasSuperClassEq
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
Definition TargetRegisterInfo.h:143

llvm::TargetRegisterInfo
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Definition TargetRegisterInfo.h:237

llvm::TypeSize::getFixed
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343

llvm::Use
A Use represents the edge between a Value definition and its users.
Definition Use.h:35

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::VirtRegMap
Definition VirtRegMap.h:35

llvm::cl::opt
Definition CommandLine.h:1429

llvm::detail::DenseSetImpl::insert
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194

llvm::detail::DenseSetImpl::count
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:174

llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition ilist_node.h:134

uint16_t

uint32_t

uint64_t

uint8_t

Changed
Changed
Definition ObjCARCOpts.cpp:2370

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

TargetMachine.h

OpName
Definition R600Defines.h:62

llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition AMDGPUAddrSpace.h:30

llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition AMDGPUAddrSpace.h:31

llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition AMDGPUAddrSpace.h:36

llvm::AMDGPU::CPol::SCAL
@ SCAL
Definition SIDefines.h:413

llvm::AMDGPU::DPP
Definition SIDefines.h:948

llvm::AMDGPU::DepCtr::encodeFieldSaSdst
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
Definition AMDGPUBaseInfo.cpp:2071

llvm::AMDGPU::SDWA::UNUSED_PRESERVE
@ UNUSED_PRESERVE
Definition SIDefines.h:928

llvm::AMDGPU::SDWA::WORD_1
@ WORD_1
Definition SIDefines.h:921

llvm::AMDGPU::SDWA::DWORD
@ DWORD
Definition SIDefines.h:922

llvm::AMDGPU::SDWA::WORD_0
@ WORD_0
Definition SIDefines.h:920

llvm::AMDGPU::SendMsg::ID_INTERRUPT
@ ID_INTERRUPT
Definition SIDefines.h:433

llvm::AMDGPU::SendMsg::ID_RTN_GET_DOORBELL
@ ID_RTN_GET_DOORBELL
Definition SIDefines.h:452

llvm::AMDGPU::UfmtGFX10::UFMT_32_FLOAT
@ UFMT_32_FLOAT
Definition SIDefines.h:691

llvm::AMDGPU::UfmtGFX11::UFMT_32_FLOAT
@ UFMT_32_FLOAT
Definition SIDefines.h:791

llvm::AMDGPU::VGPRIndexMode::SRC0_ENABLE
@ SRC0_ENABLE
Definition SIDefines.h:307

llvm::AMDGPU::VGPRIndexMode::DST_ENABLE
@ DST_ENABLE
Definition SIDefines.h:310

llvm::AMDGPU::VirtRegFlag::WWM_REG
@ WWM_REG
Definition SIDefines.h:1101

llvm::AMDGPU
Definition AMDGPUMetadataVerifier.h:34

llvm::AMDGPU::isPackedFP32Inst
bool isPackedFP32Inst(unsigned Opc)
Definition AMDGPUBaseInfo.cpp:3522

llvm::AMDGPU::isInlinableLiteralBF16
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
Definition AMDGPUBaseInfo.cpp:2988

llvm::AMDGPU::RSRC_DATA_FORMAT
const uint64_t RSRC_DATA_FORMAT
Definition SIInstrInfo.h:1706

llvm::AMDGPU::getBasicFromSDWAOp
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)

llvm::AMDGPU::getMIMGInfo
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)

llvm::AMDGPU::isInlinableLiteralFP16
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
Definition AMDGPUBaseInfo.cpp:3009

llvm::AMDGPU::getVOPe32
LLVM_READONLY int getVOPe32(uint16_t Opcode)

llvm::AMDGPU::getWMMAIsXDL
bool getWMMAIsXDL(unsigned Opc)
Definition AMDGPUBaseInfo.cpp:573

llvm::AMDGPU::mapWMMA2AddrTo3AddrOpcode
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
Definition AMDGPUBaseInfo.cpp:783

llvm::AMDGPU::isInlinableLiteralV2I16
bool isInlinableLiteralV2I16(uint32_t Literal)
Definition AMDGPUBaseInfo.cpp:3139

llvm::AMDGPU::isHi16Reg
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
Definition AMDGPUBaseInfo.cpp:2611

llvm::AMDGPU::isInlinableLiteralV2BF16
bool isInlinableLiteralV2BF16(uint32_t Literal)
Definition AMDGPUBaseInfo.cpp:3144

llvm::AMDGPU::getFlatScratchInstSVfromSS
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)

llvm::AMDGPU::getNumFlatOffsetBits
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
Definition AMDGPUBaseInfo.cpp:3291

llvm::AMDGPU::isGFX12Plus
bool isGFX12Plus(const MCSubtargetInfo &STI)
Definition AMDGPUBaseInfo.cpp:2525

llvm::AMDGPU::isInlinableLiteralV2F16
bool isInlinableLiteralV2F16(uint32_t Literal)
Definition AMDGPUBaseInfo.cpp:3149

llvm::AMDGPU::getGlobalVaddrOp
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)

llvm::AMDGPU::Imm
@ Imm
Definition AMDGPURegBankLegalizeRules.h:129

llvm::AMDGPU::isValid32BitLiteral
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
Definition AMDGPUBaseInfo.cpp:3153

llvm::AMDGPU::isLegalDPALU_DPPControl
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
Definition AMDGPUBaseInfo.h:1766

llvm::AMDGPU::getMAIIsGFX940XDL
bool getMAIIsGFX940XDL(unsigned Opc)
Definition AMDGPUBaseInfo.cpp:568

llvm::AMDGPU::RSRC_ELEMENT_SIZE_SHIFT
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition SIInstrInfo.h:1707

llvm::AMDGPU::getAddr64Inst
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)

llvm::AMDGPU::isIntrinsicAlwaysUniform
bool isIntrinsicAlwaysUniform(unsigned IntrID)
Definition AMDGPUBaseInfo.cpp:3326

llvm::AMDGPU::getMFMAEarlyClobberOp
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)

llvm::AMDGPU::hasNamedOperand
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
Definition AMDGPUBaseInfo.h:414

llvm::AMDGPU::getMIMGDimInfoByEncoding
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)

llvm::AMDGPU::isInlinableLiteral32
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
Definition AMDGPUBaseInfo.cpp:2962

llvm::AMDGPU::RSRC_TID_ENABLE
const uint64_t RSRC_TID_ENABLE
Definition SIInstrInfo.h:1709

llvm::AMDGPU::isIntrinsicSourceOfDivergence
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
Definition AMDGPUBaseInfo.cpp:3322

llvm::AMDGPU::isSISrcOperand
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
Definition AMDGPUBaseInfo.h:1594

llvm::AMDGPU::isGenericAtomic
bool isGenericAtomic(unsigned Opc)
Definition AMDGPUBaseInfo.cpp:715

llvm::AMDGPU::isInlinableIntLiteral
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
Definition AMDGPUBaseInfo.h:1674

llvm::AMDGPU::isDPALU_DPP
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST)
Definition AMDGPUBaseInfo.cpp:3507

llvm::AMDGPU::getCommuteRev
LLVM_READONLY int getCommuteRev(uint16_t Opcode)

llvm::AMDGPU::getAddrSizeMIMGOp
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
Definition AMDGPUBaseInfo.cpp:323

llvm::AMDGPU::OPERAND_KIMM32
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231

llvm::AMDGPU::OPERAND_REG_IMM_INT64
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202

llvm::AMDGPU::OPERAND_REG_IMM_V2FP16
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209

llvm::AMDGPU::OPERAND_REG_INLINE_C_FP64
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222

llvm::AMDGPU::OPERAND_REG_INLINE_C_BF16
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219

llvm::AMDGPU::OPERAND_REG_INLINE_C_V2BF16
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224

llvm::AMDGPU::OPERAND_REG_IMM_V2INT16
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210

llvm::AMDGPU::OPERAND_REG_IMM_BF16
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206

llvm::AMDGPU::OPERAND_REG_IMM_INT32
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201

llvm::AMDGPU::OPERAND_REG_IMM_V2BF16
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208

llvm::AMDGPU::OPERAND_REG_IMM_FP16
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207

llvm::AMDGPU::OPERAND_REG_INLINE_C_INT64
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218

llvm::AMDGPU::OPERAND_KIMM64
@ OPERAND_KIMM64
Definition SIDefines.h:233

llvm::AMDGPU::OPERAND_KIMM16
@ OPERAND_KIMM16
Definition SIDefines.h:232

llvm::AMDGPU::OPERAND_REG_INLINE_C_INT16
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216

llvm::AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211

llvm::AMDGPU::OPERAND_REG_IMM_FP64
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205

llvm::AMDGPU::OPERAND_REG_INLINE_C_V2FP16
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225

llvm::AMDGPU::OPERAND_REG_INLINE_AC_INT32
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236

llvm::AMDGPU::OPERAND_REG_INLINE_AC_FP32
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237

llvm::AMDGPU::OPERAND_REG_IMM_V2INT32
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212

llvm::AMDGPU::OPERAND_SDWA_VOPC_DST
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248

llvm::AMDGPU::OPERAND_REG_IMM_FP32
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204

llvm::AMDGPU::OPERAND_INPUT_MODS
@ OPERAND_INPUT_MODS
Definition SIDefines.h:245

llvm::AMDGPU::OPERAND_REG_INLINE_C_FP32
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221

llvm::AMDGPU::OPERAND_REG_INLINE_C_INT32
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217

llvm::AMDGPU::OPERAND_REG_INLINE_C_V2INT16
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223

llvm::AMDGPU::OPERAND_INLINE_C_AV64_PSEUDO
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242

llvm::AMDGPU::OPERAND_REG_IMM_V2FP32
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213

llvm::AMDGPU::OPERAND_REG_INLINE_AC_FP64
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238

llvm::AMDGPU::OPERAND_REG_INLINE_C_FP16
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220

llvm::AMDGPU::OPERAND_REG_IMM_INT16
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203

llvm::AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228

llvm::AMDGPU::TI_SCRATCH_RSRC_DWORD1
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:569

llvm::AMDGPU::TI_SCRATCH_RSRC_DWORD3
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:571

llvm::AMDGPU::TI_SCRATCH_RSRC_DWORD0
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:568

llvm::AMDGPU::TI_SCRATCH_RSRC_DWORD2
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:570

llvm::AMDGPU::TI_CONSTDATA_START
@ TI_CONSTDATA_START
Definition AMDGPU.h:567

llvm::AMDGPU::getCommuteOrig
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)

llvm::AMDGPU::getRegBitWidth
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
Definition SIRegisterInfo.cpp:3280

llvm::AMDGPU::isGFX1250
bool isGFX1250(const MCSubtargetInfo &STI)
Definition AMDGPUBaseInfo.cpp:2529

llvm::AMDGPU::getMCOpcode
int getMCOpcode(uint16_t Opcode, unsigned Gen)
Definition AMDGPUBaseInfo.cpp:796

llvm::AMDGPU::supportsScaleOffset
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
Definition AMDGPUBaseInfo.cpp:3454

llvm::AMDGPU::RSRC_INDEX_STRIDE_SHIFT
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition SIInstrInfo.h:1708

llvm::AMDGPU::getMIMGBaseOpcodeInfo
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)

llvm::AMDGPU::isInlinableLiteralI16
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
Definition AMDGPUBaseInfo.cpp:3005

llvm::AMDGPU::isGraphics
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
Definition AMDGPUBaseInfo.h:1443

llvm::AMDGPU::isInlinableLiteral64
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
Definition AMDGPUBaseInfo.cpp:2945

llvm::AMDGPU::getIfAddr64Inst
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.

llvm::CallingConv::AMDGPU_CS
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition CallingConv.h:197

llvm::CallingConv::AMDGPU_VS
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition CallingConv.h:188

llvm::CallingConv::AMDGPU_KERNEL
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition CallingConv.h:200

llvm::CallingConv::AMDGPU_HS
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition CallingConv.h:206

llvm::CallingConv::AMDGPU_GS
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition CallingConv.h:191

llvm::CallingConv::AMDGPU_PS
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition CallingConv.h:194

llvm::CallingConv::Fast
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41

llvm::CallingConv::AMDGPU_ES
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition CallingConv.h:218

llvm::CallingConv::AMDGPU_LS
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition CallingConv.h:213

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34

llvm::MCOI::OPERAND_GENERIC_4
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71

llvm::MCOI::OPERAND_PCREL
@ OPERAND_PCREL
Definition MCInstrDesc.h:64

llvm::MCOI::OPERAND_GENERIC_2
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69

llvm::MCOI::OPERAND_GENERIC_1
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68

llvm::MCOI::OPERAND_REGISTER
@ OPERAND_REGISTER
Definition MCInstrDesc.h:62

llvm::MCOI::OPERAND_GENERIC_3
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70

llvm::MCOI::OPERAND_IMMEDIATE
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61

llvm::MCOI::OPERAND_MEMORY
@ OPERAND_MEMORY
Definition MCInstrDesc.h:63

llvm::MCOI::OPERAND_UNKNOWN
@ OPERAND_UNKNOWN
Definition MCInstrDesc.h:60

llvm::MCOI::OPERAND_GENERIC_0
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67

llvm::MCOI::OPERAND_GENERIC_5
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72

llvm::RegState::Implicit
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Definition MachineInstrBuilder.h:49

llvm::RegState::Dead
@ Dead
Unused definition.
Definition MachineInstrBuilder.h:53

llvm::RegState::Define
@ Define
Register definition.
Definition MachineInstrBuilder.h:47

llvm::RegState::ImplicitDefine
@ ImplicitDefine
Definition MachineInstrBuilder.h:66

llvm::RegState::Kill
@ Kill
The last use of a register.
Definition MachineInstrBuilder.h:51

llvm::RegState::Undef
@ Undef
Value of the register doesn't matter.
Definition MachineInstrBuilder.h:55

llvm::SDPatternMatch::Not
Not(const Pred &P) -> Not< Pred >

llvm::SIEncodingFamily::SDWA
@ SDWA
Definition SIDefines.h:38

llvm::SIEncodingFamily::GFX10
@ GFX10
Definition SIDefines.h:42

llvm::SIEncodingFamily::VI
@ VI
Definition SIDefines.h:37

llvm::SIEncodingFamily::GFX1250
@ GFX1250
Definition SIDefines.h:48

llvm::SIEncodingFamily::SI
@ SI
Definition SIDefines.h:36

llvm::SIEncodingFamily::GFX90A
@ GFX90A
Definition SIDefines.h:44

llvm::SIEncodingFamily::GFX9
@ GFX9
Definition SIDefines.h:41

llvm::SIEncodingFamily::GFX11
@ GFX11
Definition SIDefines.h:46

llvm::SIEncodingFamily::SDWA9
@ SDWA9
Definition SIDefines.h:39

llvm::SIEncodingFamily::GFX80
@ GFX80
Definition SIDefines.h:40

llvm::SIEncodingFamily::GFX12
@ GFX12
Definition SIDefines.h:47

llvm::SIEncodingFamily::GFX940
@ GFX940
Definition SIDefines.h:45

llvm::SIEncodingFamily::SDWA10
@ SDWA10
Definition SIDefines.h:43

llvm::SIInstrFlags::SDWA
@ SDWA
Definition SIDefines.h:76

llvm::SIInstrFlags::FlatScratch
@ FlatScratch
Definition SIDefines.h:158

llvm::SIInstrFlags::FLAT
@ FLAT
Definition SIDefines.h:88

llvm::SIInstrFlags::D16Buf
@ D16Buf
Definition SIDefines.h:140

llvm::SISrcMods::SEXT
@ SEXT
Definition SIDefines.h:275

llvm::SISrcMods::ABS
@ ABS
Definition SIDefines.h:274

llvm::SISrcMods::OP_SEL_0
@ OP_SEL_0
Definition SIDefines.h:277

llvm::SISrcMods::OP_SEL_1
@ OP_SEL_1
Definition SIDefines.h:278

llvm::SISrcMods::NEG
@ NEG
Definition SIDefines.h:273

llvm::TargetStackID::SGPRSpill
@ SGPRSpill
Definition TargetFrameLowering.h:32

llvm::cl::ReallyHidden
@ ReallyHidden
Definition CommandLine.h:139

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:444

llvm::codeview::FrameCookieKind::Copy
@ Copy
Definition CodeView.h:495

llvm::dwarf_linker::DebugSectionKind::DebugLoc
@ DebugLoc
Definition DWARFLinkerBase.h:34

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition AddressRanges.h:18

llvm::drop_begin
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:310

llvm::ThreadPriority::Low
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262

llvm::Offset
@ Offset
Definition DWP.cpp:477

llvm::finalizeBundle
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
Definition MachineInstrBundle.cpp:119

llvm::getRegSubRegPair
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition SIInstrInfo.h:1592

llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1707

llvm::maxUIntN
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:216

llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition MachineInstrBuilder.h:369

llvm::isInt
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174

llvm::execMayBeModifiedBeforeUse
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
Definition SIInstrInfo.cpp:10064

llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2454

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649

llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition iterator_range.h:70

llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:626

llvm::alignDown
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:557

llvm::isPowerOf2_64
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293

llvm::Desc
Op::Description Desc
Definition DWARFExpressionPrinter.cpp:23

llvm::countr_zero
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186

llvm::getRegSequenceSubReg
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
Definition SIInstrInfo.cpp:9992

llvm::MONoClobber
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44

llvm::has_single_bit
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147

llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714

llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342

llvm::reverse
auto reverse(ContainerTy &&C)
Definition STLExtras.h:400

llvm::getImm
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
Definition SPIRVUtils.cpp:976

llvm::get
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
Definition PointerIntPair.h:268

llvm::Hi_32
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159

llvm::dbgs
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207

llvm::getVRegSubRegDef
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
Definition SIInstrInfo.cpp:10029

llvm::Count
FunctionAddr VTableAddr Count
Definition InstrProf.h:139

llvm::isUInt
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198

llvm::SmallVector
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
Definition SmallVector.h:1123

llvm::Lo_32
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548

llvm::MOCooperative
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52

llvm::PackElem::Hi
@ Hi
Definition VECustomDAG.h:132

llvm::PackElem::Lo
@ Lo
Definition VECustomDAG.h:131

llvm::divideCeil
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405

llvm::IRMemLocation::First
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71

llvm::Data
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189

llvm::getUndefRegState
unsigned getUndefRegState(bool B)
Definition MachineInstrBuilder.h:549

llvm::RecurKind::Xor
@ Xor
Bitwise or logical XOR of integers.
Definition IVDescriptors.h:43

llvm::RecurKind::Sub
@ Sub
Subtraction of integers.
Definition IVDescriptors.h:38

llvm::RecurKind::Add
@ Add
Sum of integers.
Definition IVDescriptors.h:37

llvm::getKillRegState
unsigned getKillRegState(bool B)
Definition MachineInstrBuilder.h:543

llvm::isTargetSpecificOpcode
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition TargetOpcodes.h:36

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:22

llvm::ReplacementType::Format
@ Format
Definition FormatVariadic.h:47

llvm::ArrayRef
ArrayRef(const T &OneElt) -> ArrayRef< T >

llvm::DefaultMemoryClusterDWordsLimit
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40

llvm::BitWidth
constexpr unsigned BitWidth
Definition BitmaskEnum.h:223

llvm::DS_Error
@ DS_Error
Definition DiagnosticInfo.h:52

llvm::isIntN
constexpr bool isIntN(unsigned N, int64_t x)
Checks if an signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257

llvm::MOLastUse
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48

llvm::reverseBits
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:127

llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1879

llvm::SignExtend64
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:583

llvm::maskTrailingOnes
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86

llvm::getUnderlyingObject
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
Definition ValueTracking.cpp:6658

llvm::InstructionUniformity
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18

llvm::InstructionUniformity::AlwaysUniform
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23

llvm::InstructionUniformity::NeverUniform
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26

llvm::InstructionUniformity::Default
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20

llvm::MachineCycleInfo
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
Definition MachineCycleAnalysis.h:25

llvm::MachineCycle
MachineCycleInfo::CycleT MachineCycle
Definition MachineCycleAnalysis.h:26

llvm::popcount
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154

llvm::execMayBeModifiedBeforeAnyUse
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
Definition SIInstrInfo.cpp:10097

std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853

N
#define N

llvm::AMDGPU::MIMGBaseOpcodeInfo
Definition AMDGPUBaseInfo.h:421

llvm::AMDGPU::MIMGDimInfo
Definition AMDGPUBaseInfo.h:448

llvm::AMDGPU::MIMGInfo
Definition AMDGPUBaseInfo.h:524

llvm::APFloatBase::SemanticsToEnum
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:219

llvm::APFloatBase::S_IEEEsingle
@ S_IEEEsingle
Definition APFloat.h:156

llvm::APFloatBase::S_IEEEhalf
@ S_IEEEhalf
Definition APFloat.h:154

llvm::APFloatBase::S_BFloat
@ S_BFloat
Definition APFloat.h:155

llvm::APFloatBase::S_IEEEdouble
@ S_IEEEdouble
Definition APFloat.h:157

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39

llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85

llvm::DestSourcePair
Definition TargetInstrInfo.h:75

llvm::LiveVariables::VarInfo::AliveBlocks
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition LiveVariables.h:84

llvm::MachinePointerInfo
This class contains a discriminated union of information about pointers in memory operands,...
Definition MachineMemOperand.h:42

llvm::MachinePointerInfo::getFixedStack
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Definition MachineOperand.cpp:1064

llvm::MemOp
Definition TargetLowering.h:118

llvm::SIInstrWorklist
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56

llvm::SIInstrWorklist::top
MachineInstr * top() const
Definition SIInstrInfo.h:61

llvm::SIInstrWorklist::empty
bool empty() const
Definition SIInstrInfo.h:71

llvm::SIInstrWorklist::erase_top
void erase_top()
Definition SIInstrInfo.h:66

llvm::SIInstrWorklist::isDeferred
bool isDeferred(MachineInstr *MI)
Definition SIInstrInfo.cpp:7482

llvm::SIInstrWorklist::getDeferredList
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80

llvm::SIInstrWorklist::insert
void insert(MachineInstr *MI)
Definition SIInstrInfo.cpp:7472

llvm::TargetInstrInfo::RegSubRegPair
A pair composed of a register and a sub-register index.
Definition TargetInstrInfo.h:527

llvm::TargetInstrInfo::RegSubRegPair::SubReg
unsigned SubReg
Definition TargetInstrInfo.h:529

llvm::TargetInstrInfo::RegSubRegPair::Reg
Register Reg
Definition TargetInstrInfo.h:528

llvm::cl::desc
Definition CommandLine.h:410