LLVM: lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp Source File

//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

/// \file

/// This file implements the targeting of the RegisterBankInfo class for

/// AMDGPU.

///

/// \par

///

/// AMDGPU has unique register bank constraints that require special high level

/// strategies to deal with. There are two main true physical register banks

/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a

/// sort of pseudo-register bank needed to represent SGPRs used in a vector

/// boolean context. There is also the AGPR bank, which is a special purpose

/// physical register bank present on some subtargets.

///

/// Copying from VGPR to SGPR is generally illegal, unless the value is known to

/// be uniform. It is generally not valid to legalize operands by inserting

/// copies as on other targets. Operations which require uniform, SGPR operands

/// generally require scalarization by repeatedly executing the instruction,

/// activating each set of lanes using a unique set of input values. This is

/// referred to as a waterfall loop.

///

/// \par Booleans

///

/// Booleans (s1 values) requires special consideration. A vector compare result

/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit

/// register. These are represented with the VCC bank. During selection, we need

/// to be able to unambiguously go back from a register class to a register

/// bank. To distinguish whether an SGPR should use the SGPR or VCC register

/// bank, we need to know the use context type. An SGPR s1 value always means a

/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets

/// SCC, which is a 1-bit unaddressable register. This will need to be copied to

/// a 32-bit virtual register. Taken together, this means we need to adjust the

/// type of boolean operations to be regbank legal. All SALU booleans need to be

/// widened to 32-bits, and all VALU booleans need to be s1 values.

///

/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact

/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc

/// bank. A non-boolean source (such as a truncate from a 1-bit load from

/// memory) will require a copy to the VCC bank which will require clearing the

/// high bits and inserting a compare.

///

/// \par Constant bus restriction

///

/// VALU instructions have a limitation known as the constant bus

/// restriction. Most VALU instructions can use SGPR operands, but may read at

/// most 1 SGPR or constant literal value (this to 2 in gfx10 for most

/// instructions). This is one unique SGPR, so the same SGPR may be used for

/// multiple operands. From a register bank perspective, any combination of

/// operands should be legal as an SGPR, but this is contextually dependent on

/// the SGPR operands all being the same register. There is therefore optimal to

/// choose the SGPR with the most uses to minimize the number of copies.

///

/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*

/// operation should have its source operands all mapped to VGPRs (except for

/// VCC), inserting copies from any SGPR operands. This the most trivial legal

/// mapping. Anything beyond the simplest 1:1 instruction selection would be too

/// complicated to solve here. Every optimization pattern or instruction

/// selected to multiple outputs would have to enforce this rule, and there

/// would be additional complexity in tracking this rule for every G_*

/// operation. By forcing all inputs to VGPRs, it also simplifies the task of

/// picking the optimal operand combination from a post-isel optimization pass.

///

//===----------------------------------------------------------------------===//


#include "AMDGPURegisterBankInfo.h"


#include "AMDGPU.h"

#include "AMDGPUGlobalISelUtils.h"

#include "AMDGPUInstrInfo.h"

#include "GCNSubtarget.h"

#include "SIMachineFunctionInfo.h"

#include "SIRegisterInfo.h"

#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"

#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"

#include "llvm/CodeGen/RegisterBank.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"


#define GET_TARGET_REGBANK_IMPL

#include "AMDGPUGenRegisterBank.inc"


// This file will be TableGen'ed at some point.

#include "AMDGPUGenRegisterBankInfo.def"


using namespace llvm;

using namespace MIPatternMatch;


namespace {


// Observer to apply a register bank to new registers created by LegalizerHelper.

class ApplyRegBankMapping final : public GISelChangeObserver {

private:

  MachineIRBuilder &B;

  const AMDGPURegisterBankInfo &RBI;

  MachineRegisterInfo &MRI;

  const RegisterBank *NewBank;

  SmallVector<MachineInstr *, 4> NewInsts;


public:

  ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,

                      MachineRegisterInfo &MRI_, const RegisterBank *RB)

      : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {

    assert(!B.isObservingChanges());

    B.setChangeObserver(*this);

  }


  ~ApplyRegBankMapping() override {

    for (MachineInstr *MI : NewInsts)

      applyBank(*MI);


    B.stopObservingChanges();

  }


  /// Set any registers that don't have a set register class or bank to SALU.

  void applyBank(MachineInstr &MI) {

    const unsigned Opc = MI.getOpcode();

    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||

        Opc == AMDGPU::G_SEXT) {

      // LegalizerHelper wants to use the basic legalization artifacts when

      // widening etc. We don't handle selection with vcc in artifact sources,

      // so we need to use a select instead to handle these properly.

      Register DstReg = MI.getOperand(0).getReg();

      Register SrcReg = MI.getOperand(1).getReg();

      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);

      if (SrcBank == &AMDGPU::VCCRegBank) {

        const LLT S32 = LLT::scalar(32);

        assert(MRI.getType(SrcReg) == LLT::scalar(1));

        assert(MRI.getType(DstReg) == S32);

        assert(NewBank == &AMDGPU::VGPRRegBank);


        // Replace the extension with a select, which really uses the boolean

        // source.

        B.setInsertPt(*MI.getParent(), MI);


        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);

        auto False = B.buildConstant(S32, 0);

        B.buildSelect(DstReg, SrcReg, True, False);

        MRI.setRegBank(True.getReg(0), *NewBank);

        MRI.setRegBank(False.getReg(0), *NewBank);

        MI.eraseFromParent();

      }


      assert(!MRI.getRegClassOrRegBank(DstReg));

      MRI.setRegBank(DstReg, *NewBank);

      return;

    }


#ifndef NDEBUG

    if (Opc == AMDGPU::G_TRUNC) {

      Register DstReg = MI.getOperand(0).getReg();

      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);

      assert(DstBank != &AMDGPU::VCCRegBank);

    }

#endif


    for (MachineOperand &Op : MI.operands()) {

      if (!Op.isReg())

        continue;


      // We may see physical registers if building a real MI

      Register Reg = Op.getReg();

      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))

        continue;


      const RegisterBank *RB = NewBank;

      if (MRI.getType(Reg) == LLT::scalar(1)) {

        assert(NewBank == &AMDGPU::VGPRRegBank &&

               "s1 operands should only be used for vector bools");

        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&

                MI.getOpcode() != AMDGPU::G_ANYEXT) &&

               "not expecting legalization artifacts here");

        RB = &AMDGPU::VCCRegBank;

      }


      MRI.setRegBank(Reg, *RB);

    }

  }


  void erasingInstr(MachineInstr &MI) override {}


  void createdInstr(MachineInstr &MI) override {

    // At this point, the instruction was just inserted and has no operands.

    NewInsts.push_back(&MI);

  }


  void changingInstr(MachineInstr &MI) override {}

  void changedInstr(MachineInstr &MI) override {

    // FIXME: In principle we should probably add the instruction to NewInsts,

    // but the way the LegalizerHelper uses the observer, we will always see the

    // registers we need to set the regbank on also referenced in a new

    // instruction.

  }

};


} // anonymous namespace


AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)

    : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),

      TII(Subtarget.getInstrInfo()) {


  // HACK: Until this is fully tablegen'd.

  static llvm::once_flag InitializeRegisterBankFlag;


  static auto InitializeRegisterBankOnce = [this]() {

    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&

           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&

           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);

    (void)this;

  };


  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);

}


static bool isVectorRegisterBank(const RegisterBank &Bank) {

  unsigned BankID = Bank.getID();

  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;

}


bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {

  return RB != &AMDGPU::SGPRRegBank;

}


unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,

                                          const RegisterBank &Src,

                                          TypeSize Size) const {

  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?

  if (Dst.getID() == AMDGPU::SGPRRegBankID &&

      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {

    return std::numeric_limits<unsigned>::max();

  }


  // Bool values are tricky, because the meaning is based on context. The SCC

  // and VCC banks are for the natural scalar and vector conditions produced by

  // a compare.

  //

  // Legalization doesn't know about the necessary context, so an s1 use may

  // have been a truncate from an arbitrary value, in which case a copy (lowered

  // as a compare with 0) needs to be inserted.

  if (Size == 1 &&

      (Dst.getID() == AMDGPU::SGPRRegBankID) &&

      (isVectorRegisterBank(Src) ||

       Src.getID() == AMDGPU::SGPRRegBankID ||

       Src.getID() == AMDGPU::VCCRegBankID))

    return std::numeric_limits<unsigned>::max();


  // There is no direct copy between AGPRs.

  if (Dst.getID() == AMDGPU::AGPRRegBankID &&

      Src.getID() == AMDGPU::AGPRRegBankID)

    return 4;


  return RegisterBankInfo::copyCost(Dst, Src, Size);

}


unsigned AMDGPURegisterBankInfo::getBreakDownCost(

  const ValueMapping &ValMapping,

  const RegisterBank *CurBank) const {

  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to

  // VGPR.

  // FIXME: Is there a better way to do this?

  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)

    return 10; // This is expensive.


  assert(ValMapping.NumBreakDowns == 2 &&

         ValMapping.BreakDown[0].Length == 32 &&

         ValMapping.BreakDown[0].StartIdx == 0 &&

         ValMapping.BreakDown[1].Length == 32 &&

         ValMapping.BreakDown[1].StartIdx == 32 &&

         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);


  // 32-bit extract of a 64-bit value is just access of a subregister, so free.

  // TODO: Cost of 0 hits assert, though it's not clear it's what we really

  // want.


  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR

  // alignment restrictions, but this probably isn't important.

  return 1;

}


const RegisterBank &

AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,

                                               LLT Ty) const {

  if (&RC == &AMDGPU::SReg_1RegClass)

    return AMDGPU::VCCRegBank;


  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a

  // VCC-like use.

  if (TRI->isSGPRClass(&RC)) {

    // FIXME: This probably came from a copy from a physical register, which

    // should be inferable from the copied to-type. We don't have many boolean

    // physical register constraints so just assume a normal SGPR for now.

    if (!Ty.isValid())

      return AMDGPU::SGPRRegBank;


    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;

  }


  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;

}


template <unsigned NumOps>

RegisterBankInfo::InstructionMappings

AMDGPURegisterBankInfo::addMappingFromTable(

    const MachineInstr &MI, const MachineRegisterInfo &MRI,

    const std::array<unsigned, NumOps> RegSrcOpIdx,

    ArrayRef<OpRegBankEntry<NumOps>> Table) const {


  InstructionMappings AltMappings;


  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());


  unsigned Sizes[NumOps];

  for (unsigned I = 0; I < NumOps; ++I) {

    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();

    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);

  }


  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {

    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);

    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);

  }


  // getInstrMapping's default mapping uses ID 1, so start at 2.

  unsigned MappingID = 2;

  for (const auto &Entry : Table) {

    for (unsigned I = 0; I < NumOps; ++I) {

      int OpIdx = RegSrcOpIdx[I];

      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);

    }


    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,

                                                 getOperandsMapping(Operands),

                                                 Operands.size()));

  }


  return AltMappings;

}


RegisterBankInfo::InstructionMappings

AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(

    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {

  case Intrinsic::amdgcn_readlane: {

    static const OpRegBankEntry<3> Table[2] = {

      // Perfectly legal.

      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },


      // Need a readfirstlane for the index.

      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }

    };


    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };

    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);

  }

  case Intrinsic::amdgcn_writelane: {

    static const OpRegBankEntry<4> Table[4] = {

      // Perfectly legal.

      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },


      // Need readfirstlane of first op

      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },


      // Need readfirstlane of second op

      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },


      // Need readfirstlane of both ops

      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }

    };


    // rsrc, voffset, offset

    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };

    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);

  }

  default:

    return RegisterBankInfo::getInstrAlternativeMappings(MI);

  }

}


RegisterBankInfo::InstructionMappings

AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(

    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {


  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {

  case Intrinsic::amdgcn_s_buffer_load: {

    static const OpRegBankEntry<2> Table[4] = {

      // Perfectly legal.

      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },


      // Only need 1 register in loop

      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },


      // Have to waterfall the resource.

      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },


      // Have to waterfall the resource, and the offset.

      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }

    };


    // rsrc, offset

    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };

    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);

  }

  case Intrinsic::amdgcn_ds_ordered_add:

  case Intrinsic::amdgcn_ds_ordered_swap: {

    // VGPR = M0, VGPR

    static const OpRegBankEntry<3> Table[2] = {

      // Perfectly legal.

      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },


      // Need a readfirstlane for m0

      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }

    };


    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };

    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);

  }

  case Intrinsic::amdgcn_s_sendmsg:

  case Intrinsic::amdgcn_s_sendmsghalt: {

    // FIXME: Should have no register for immediate

    static const OpRegBankEntry<1> Table[2] = {

      // Perfectly legal.

      { { AMDGPU::SGPRRegBankID }, 1 },


      // Need readlane

      { { AMDGPU::VGPRRegBankID }, 3 }

    };


    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };

    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);

  }

  default:

    return RegisterBankInfo::getInstrAlternativeMappings(MI);

  }

}


// FIXME: Returns uniform if there's no source value information. This is

// probably wrong.

bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {

  if (!MI.hasOneMemOperand())

    return false;


  const MachineMemOperand *MMO = *MI.memoperands_begin();

  const unsigned AS = MMO->getAddrSpace();

  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||

                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

  const unsigned MemSize = 8 * MMO->getSize().getValue();


  // Require 4-byte alignment.

  return (MMO->getAlign() >= Align(4) ||

          (Subtarget.hasScalarSubwordLoads() &&

           ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||

            (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&

         // Can't do a scalar atomic load.

         !MMO->isAtomic() &&

         // Don't use scalar loads for volatile accesses to non-constant address

         // spaces.

         (IsConst || !MMO->isVolatile()) &&

         // Memory must be known constant, or not written before this load.

         (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&

         AMDGPU::isUniformMMO(MMO);

}


RegisterBankInfo::InstructionMappings

AMDGPURegisterBankInfo::getInstrAlternativeMappings(

    const MachineInstr &MI) const {


  const MachineFunction &MF = *MI.getParent()->getParent();

  const MachineRegisterInfo &MRI = MF.getRegInfo();


  InstructionMappings AltMappings;

  switch (MI.getOpcode()) {

  case TargetOpcode::G_CONSTANT:

  case TargetOpcode::G_IMPLICIT_DEF: {

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {

      static const OpRegBankEntry<1> Table[3] = {

        { { AMDGPU::VGPRRegBankID }, 1 },

        { { AMDGPU::SGPRRegBankID }, 1 },

        { { AMDGPU::VCCRegBankID }, 1 }

      };


      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);

    }


    [[fallthrough]];

  }

  case TargetOpcode::G_FCONSTANT:

  case TargetOpcode::G_FRAME_INDEX:

  case TargetOpcode::G_GLOBAL_VALUE: {

    static const OpRegBankEntry<1> Table[2] = {

      { { AMDGPU::VGPRRegBankID }, 1 },

      { { AMDGPU::SGPRRegBankID }, 1 }

    };


    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);

  }

  case TargetOpcode::G_AND:

  case TargetOpcode::G_OR:

  case TargetOpcode::G_XOR: {

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);


    if (Size == 1) {

      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.

      const InstructionMapping &SCCMapping = getInstructionMapping(

        1, 1, getOperandsMapping(

          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),

           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),

           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),

        3); // Num Operands

      AltMappings.push_back(&SCCMapping);


      const InstructionMapping &VCCMapping0 = getInstructionMapping(

        2, 1, getOperandsMapping(

          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),

           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),

           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),

        3); // Num Operands

      AltMappings.push_back(&VCCMapping0);

      return AltMappings;

    }


    if (Size != 64)

      break;


    const InstructionMapping &SSMapping = getInstructionMapping(

      1, 1, getOperandsMapping(

        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),

         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),

         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),

      3); // Num Operands

    AltMappings.push_back(&SSMapping);


    const InstructionMapping &VVMapping = getInstructionMapping(

      2, 2, getOperandsMapping(

        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),

         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),

         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),

      3); // Num Operands

    AltMappings.push_back(&VVMapping);

    break;

  }

  case TargetOpcode::G_LOAD:

  case TargetOpcode::G_ZEXTLOAD:

  case TargetOpcode::G_SEXTLOAD: {

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());

    unsigned PtrSize = PtrTy.getSizeInBits();

    unsigned AS = PtrTy.getAddressSpace();


    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&

         AS != AMDGPUAS::PRIVATE_ADDRESS) &&

        isScalarLoadLegal(MI)) {

      const InstructionMapping &SSMapping = getInstructionMapping(

          1, 1, getOperandsMapping(

                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),

                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),

          2); // Num Operands

      AltMappings.push_back(&SSMapping);

    }


    const InstructionMapping &VVMapping = getInstructionMapping(

        2, 1,

        getOperandsMapping(

            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),

             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),

        2); // Num Operands

    AltMappings.push_back(&VVMapping);


    // It may be possible to have a vgpr = load sgpr mapping here, because

    // the mubuf instructions support this kind of load, but probably for only

    // gfx7 and older.  However, the addressing mode matching in the instruction

    // selector should be able to do a better job of detecting and selecting

    // these kinds of loads from the vgpr = load vgpr mapping.


    return AltMappings;


  }

  case TargetOpcode::G_SELECT: {

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,

      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),

                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),

                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),

                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),

      4); // Num Operands

    AltMappings.push_back(&SSMapping);


    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,

      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),

                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),

                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),

                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),

      4); // Num Operands

    AltMappings.push_back(&VVMapping);


    return AltMappings;

  }

  case TargetOpcode::G_UADDE:

  case TargetOpcode::G_USUBE:

  case TargetOpcode::G_SADDE:

  case TargetOpcode::G_SSUBE: {

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,

      getOperandsMapping(

        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),

         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),

         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),

         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),

         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),

      5); // Num Operands

    AltMappings.push_back(&SSMapping);


    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,

      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),

                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),

                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),

                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),

                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),

      5); // Num Operands

    AltMappings.push_back(&VVMapping);

    return AltMappings;

  }

  case AMDGPU::G_BRCOND: {

    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);


    // TODO: Change type to 32 for scalar

    const InstructionMapping &SMapping = getInstructionMapping(

      1, 1, getOperandsMapping(

        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),

      2); // Num Operands

    AltMappings.push_back(&SMapping);


    const InstructionMapping &VMapping = getInstructionMapping(

      1, 1, getOperandsMapping(

        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),

      2); // Num Operands

    AltMappings.push_back(&VMapping);

    return AltMappings;

  }

  case AMDGPU::G_INTRINSIC:

  case AMDGPU::G_INTRINSIC_CONVERGENT:

    return getInstrAlternativeMappingsIntrinsic(MI, MRI);

  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:

  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:

    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);

  default:

    break;

  }

  return RegisterBankInfo::getInstrAlternativeMappings(MI);

}


void AMDGPURegisterBankInfo::split64BitValueForMapping(

  MachineIRBuilder &B,

  SmallVector<Register, 2> &Regs,

  LLT HalfTy,

  Register Reg) const {

  assert(HalfTy.getSizeInBits() == 32);

  MachineRegisterInfo *MRI = B.getMRI();

  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);

  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);

  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);

  MRI->setRegBank(LoLHS, *Bank);

  MRI->setRegBank(HiLHS, *Bank);


  Regs.push_back(LoLHS);

  Regs.push_back(HiLHS);


  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)

    .addDef(LoLHS)

    .addDef(HiLHS)

    .addUse(Reg);

}


/// Replace the current type each register in \p Regs has with \p NewTy

static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,

                          LLT NewTy) {

  for (Register Reg : Regs) {

    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());

    MRI.setType(Reg, NewTy);

  }

}


static LLT getHalfSizedType(LLT Ty) {

  if (Ty.isVector()) {

    assert(Ty.getElementCount().isKnownMultipleOf(2));

    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),

                               Ty.getElementType());

  }


  assert(Ty.getScalarSizeInBits() % 2 == 0);

  return LLT::scalar(Ty.getScalarSizeInBits() / 2);

}


// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector

// source value into a scalar register.

Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,

                                                    MachineRegisterInfo &MRI,

                                                    Register Src) const {

  LLT Ty = MRI.getType(Src);

  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);


  if (Bank == &AMDGPU::SGPRRegBank)

    return Src;


  unsigned Bits = Ty.getSizeInBits();

  assert(Bits % 32 == 0);


  if (Bank != &AMDGPU::VGPRRegBank) {

    // We need to copy from AGPR to VGPR

    Src = B.buildCopy(Ty, Src).getReg(0);

    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);

  }


  LLT S32 = LLT::scalar(32);

  unsigned NumParts = Bits / 32;

  SmallVector<Register, 8> SrcParts;

  SmallVector<Register, 8> DstParts;


  if (Bits == 32) {

    SrcParts.push_back(Src);

  } else {

    auto Unmerge = B.buildUnmerge(S32, Src);

    for (unsigned i = 0; i < NumParts; ++i)

      SrcParts.push_back(Unmerge.getReg(i));

  }


  for (unsigned i = 0; i < NumParts; ++i) {

    Register SrcPart = SrcParts[i];

    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);


    const TargetRegisterClass *Constrained =

        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);

    (void)Constrained;

    assert(Constrained && "Failed to constrain readfirstlane src reg");


    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});


    DstParts.push_back(DstPart);

  }


  if (Bits == 32)

    return DstParts[0];


  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);

  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);

  return Dst;

}


/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If

/// any of the required SGPR operands are VGPRs, perform a waterfall loop to

/// execute the instruction for each unique combination of values in all lanes

/// in the wave. The block will be split such that rest of the instructions are

/// moved to a new block.

///

/// Essentially performs this loop:

//

/// Save Execution Mask

/// For (Lane : Wavefront) {

///   Enable Lane, Disable all other lanes

///   SGPR = read SGPR value for current lane from VGPR

///   VGPRResult[Lane] = use_op SGPR

/// }

/// Restore Execution Mask

///

/// There is additional complexity to try for compare values to identify the

/// unique values used.

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(

    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,

    SmallSet<Register, 4> &SGPROperandRegs) const {

  // Track use registers which have already been expanded with a readfirstlane

  // sequence. This may have multiple uses if moving a sequence.

  DenseMap<Register, Register> WaterfalledRegMap;


  MachineBasicBlock &MBB = B.getMBB();

  MachineFunction *MF = &B.getMF();


  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();

  const unsigned MovExecOpc =

      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  const unsigned MovExecTermOpc =

      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;


  const unsigned XorTermOpc = Subtarget.isWave32() ?

    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;

  const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?

    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  const unsigned ExecReg =  Subtarget.isWave32() ?

    AMDGPU::EXEC_LO : AMDGPU::EXEC;


#ifndef NDEBUG

  const int OrigRangeSize = std::distance(Range.begin(), Range.end());

#endif


  MachineRegisterInfo &MRI = *B.getMRI();

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);

  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);


  // Don't bother using generic instructions/registers for the exec mask.

  B.buildInstr(TargetOpcode::IMPLICIT_DEF)

    .addDef(InitSaveExecReg);


  Register PhiExec = MRI.createVirtualRegister(WaveRC);

  Register NewExec = MRI.createVirtualRegister(WaveRC);


  // To insert the loop we need to split the block. Move everything before this

  // point to a new block, and insert a new empty block before this instruction.

  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();

  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();

  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();

  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();

  MachineFunction::iterator MBBI(MBB);

  ++MBBI;

  MF->insert(MBBI, LoopBB);

  MF->insert(MBBI, BodyBB);

  MF->insert(MBBI, RestoreExecBB);

  MF->insert(MBBI, RemainderBB);


  LoopBB->addSuccessor(BodyBB);

  BodyBB->addSuccessor(RestoreExecBB);

  BodyBB->addSuccessor(LoopBB);


  // Move the rest of the block into a new block.

  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);

  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());


  MBB.addSuccessor(LoopBB);

  RestoreExecBB->addSuccessor(RemainderBB);


  B.setInsertPt(*LoopBB, LoopBB->end());


  B.buildInstr(TargetOpcode::PHI)

      .addDef(PhiExec)

      .addReg(InitSaveExecReg)

      .addMBB(&MBB)

      .addReg(NewExec)

      .addMBB(BodyBB);


  const DebugLoc &DL = B.getDL();


  MachineInstr &FirstInst = *Range.begin();


  // Move the instruction into the loop body. Note we moved everything after

  // Range.end() already into a new block, so Range.end() is no longer valid.

  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());


  // Figure out the iterator range after splicing the instructions.

  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();

  auto NewEnd = BodyBB->end();


  B.setMBB(*LoopBB);


  LLT S1 = LLT::scalar(1);

  Register CondReg;


  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);


  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {

    for (MachineOperand &Op : MI.all_uses()) {

      Register OldReg = Op.getReg();

      if (!SGPROperandRegs.count(OldReg))

        continue;


      // See if we already processed this register in another instruction in the

      // sequence.

      auto OldVal = WaterfalledRegMap.find(OldReg);

      if (OldVal != WaterfalledRegMap.end()) {

        Op.setReg(OldVal->second);

        continue;

      }


      Register OpReg = Op.getReg();

      LLT OpTy = MRI.getType(OpReg);


      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);

      if (OpBank != &AMDGPU::VGPRRegBank) {

        // Insert copy from AGPR to VGPR before the loop.

        B.setMBB(MBB);

        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);

        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);

        B.setMBB(*LoopBB);

      }


      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);


      // Build the comparison(s).

      unsigned OpSize = OpTy.getSizeInBits();

      bool Is64 = OpSize % 64 == 0;

      unsigned PartSize = Is64 ? 64 : 32;

      LLT PartTy = LLT::scalar(PartSize);

      unsigned NumParts = OpSize / PartSize;

      SmallVector<Register, 8> OpParts;

      SmallVector<Register, 8> CurrentLaneParts;


      if (NumParts == 1) {

        OpParts.push_back(OpReg);

        CurrentLaneParts.push_back(CurrentLaneReg);

      } else {

        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);

        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);

        for (unsigned i = 0; i < NumParts; ++i) {

          OpParts.push_back(UnmergeOp.getReg(i));

          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));

          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);

          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);

        }

      }


      for (unsigned i = 0; i < NumParts; ++i) {

        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],

                                  OpParts[i]).getReg(0);

        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);


        if (!CondReg) {

          CondReg = CmpReg;

        } else {

          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);

          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);

        }

      }


      Op.setReg(CurrentLaneReg);


      // Make sure we don't re-process this register again.

      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));

    }

  }


  // The ballot becomes a no-op during instruction selection.

  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,

                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})

                .addReg(CondReg)

                .getReg(0);

  MRI.setRegClass(CondReg, WaveRC);


  // Update EXEC, save the original EXEC value to VCC.

  B.buildInstr(AndSaveExecOpc)

    .addDef(NewExec)

    .addReg(CondReg, RegState::Kill);


  MRI.setSimpleHint(NewExec, CondReg);


  B.setInsertPt(*BodyBB, BodyBB->end());


  // Update EXEC, switch all done bits to 0 and all todo bits to 1.

  B.buildInstr(XorTermOpc)

    .addDef(ExecReg)

    .addReg(ExecReg)

    .addReg(NewExec);


  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use

  // s_cbranch_scc0?


  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.

  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);


  // Save the EXEC mask before the loop.

  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)

    .addReg(ExecReg);


  // Restore the EXEC mask after the loop.

  B.setMBB(*RestoreExecBB);

  B.buildInstr(MovExecTermOpc)

    .addDef(ExecReg)

    .addReg(SaveExecReg);


  // Set the insert point after the original instruction, so any new

  // instructions will be in the remainder.

  B.setInsertPt(*RemainderBB, RemainderBB->begin());


  return true;

}


// Return any unique registers used by \p MI at \p OpIndices that need to be

// handled in a waterfall loop. Returns these registers in \p

// SGPROperandRegs. Returns true if there are any operands to handle and a

// waterfall loop is necessary.

bool AMDGPURegisterBankInfo::collectWaterfallOperands(

  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,

  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {

  for (unsigned Op : OpIndices) {

    assert(MI.getOperand(Op).isUse());

    Register Reg = MI.getOperand(Op).getReg();

    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);

    if (OpBank->getID() != AMDGPU::SGPRRegBankID)

      SGPROperandRegs.insert(Reg);

  }


  // No operands need to be replaced, so no need to loop.

  return !SGPROperandRegs.empty();

}


bool AMDGPURegisterBankInfo::executeInWaterfallLoop(

    MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {

  // Use a set to avoid extra readfirstlanes in the case where multiple operands

  // are the same register.

  SmallSet<Register, 4> SGPROperandRegs;


  if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))

    return false;


  MachineBasicBlock::iterator I = MI.getIterator();

  return executeInWaterfallLoop(B, make_range(I, std::next(I)),

                                SGPROperandRegs);

}


// Legalize an operand that must be an SGPR by inserting a readfirstlane.

void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(

    MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {

  Register Reg = MI.getOperand(OpIdx).getReg();

  MachineRegisterInfo &MRI = *B.getMRI();

  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

  if (Bank == &AMDGPU::SGPRRegBank)

    return;


  Reg = buildReadFirstLane(B, MRI, Reg);

  MI.getOperand(OpIdx).setReg(Reg);

}


/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the

/// rest will be in the remainder.

static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {

  unsigned TotalSize = Ty.getSizeInBits();

  if (!Ty.isVector())

    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};


  LLT EltTy = Ty.getElementType();

  unsigned EltSize = EltTy.getSizeInBits();

  assert(FirstSize % EltSize == 0);


  unsigned FirstPartNumElts = FirstSize / EltSize;

  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;


  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),

          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};

}


static LLT widen96To128(LLT Ty) {

  if (!Ty.isVector())

    return LLT::scalar(128);


  LLT EltTy = Ty.getElementType();

  assert(128 % EltTy.getSizeInBits() == 0);

  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);

}


bool AMDGPURegisterBankInfo::applyMappingLoad(

    MachineIRBuilder &B,

    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,

    MachineInstr &MI) const {

  MachineRegisterInfo &MRI = *B.getMRI();

  Register DstReg = MI.getOperand(0).getReg();

  const LLT LoadTy = MRI.getType(DstReg);

  unsigned LoadSize = LoadTy.getSizeInBits();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  const unsigned MaxNonSmrdLoadSize = 128;


  const RegisterBank *DstBank =

      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

  if (DstBank == &AMDGPU::SGPRRegBank) {

    // There are some special cases that we need to look at for 32 bit and 96

    // bit SGPR loads otherwise we have nothing to do.

    if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))

      return false;


    const unsigned MemSize = 8 * MMO->getSize().getValue();

    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to

    // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit

    // scalar loads should have a load size of 32 but memory access size of less

    // than 32.

    if (LoadSize == 32 &&

        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))

      return false;


    if (LoadSize == 32 &&

        ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||

         (MemSize == 16 && MMO->getAlign() >= Align(2))) &&

        isScalarLoadLegal(MI) &&

        Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)

      return false;


    Register PtrReg = MI.getOperand(1).getReg();


    ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);


    if (LoadSize == 32) {

      // This is an extending load from a sub-dword size. Widen the memory

      // access size to 4 bytes and clear the extra high bits appropriately

      const LLT S32 = LLT::scalar(32);

      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {

        // Must extend the sign bit into higher bits for a G_SEXTLOAD

        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);

        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);

      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {

        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD

        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);

        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);

      } else

        // We do not need to touch the higher bits for regular loads.

        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);

    } else {

      // 96-bit loads are only available for vector loads. We need to split this

      // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).

      if (MMO->getAlign() < Align(16)) {

        LegalizerHelper Helper(B.getMF(), ApplyBank, B);

        LLT Part64, Part32;

        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);

        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=

            LegalizerHelper::Legalized)

          return false;

        return true;

      }

      LLT WiderTy = widen96To128(LoadTy);

      auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);

      if (WiderTy.isScalar()) {

        B.buildTrunc(MI.getOperand(0), WideLoad);

      } else {

        B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),

                                            WideLoad);

      }

    }


    MI.eraseFromParent();

    return true;

  }


  // 128-bit loads are supported for all instruction types.

  if (LoadSize <= MaxNonSmrdLoadSize)

    return false;


  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));


  if (SrcRegs.empty())

    SrcRegs.push_back(MI.getOperand(1).getReg());


  // RegBankSelect only emits scalar types, so we need to reset the pointer

  // operand to a pointer type.

  Register BasePtrReg = SrcRegs[0];

  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());

  MRI.setType(BasePtrReg, PtrTy);


  // The following are the loads not splitted enough during legalization

  // because it was not clear they are smem-load or vmem-load

  if (AMDGPU::isExtendedGlobalAddrSpace(MMO->getAddrSpace()) ||

      MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) {

    assert(LoadSize % MaxNonSmrdLoadSize == 0);

    unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;

    const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);

    ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);

    LegalizerHelper Helper(B.getMF(), O, B);

    if (LoadTy.isVector()) {

      if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=

          LegalizerHelper::Legalized)

        return false;

    } else {

      if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)

        return false;

    }

  }


  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);

  return true;

}


bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(

    MachineIRBuilder &B,

    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,

    MachineInstr &MI) const {

  MachineRegisterInfo &MRI = *B.getMRI();

  const MachineFunction &MF = B.getMF();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  const auto &TFI = *ST.getFrameLowering();


  // Guard in case the stack growth direction ever changes with scratch

  // instructions.

  assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&

         "Stack grows upwards for AMDGPU");


  Register Dst = MI.getOperand(0).getReg();

  Register AllocSize = MI.getOperand(1).getReg();

  Align Alignment = assumeAligned(MI.getOperand(2).getImm());


  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);


  if (SizeBank != &AMDGPU::SGPRRegBank) {

    auto WaveReduction =

        B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)})

            .addUse(AllocSize)

            .addImm(0);

    AllocSize = WaveReduction.getReg(0);

  }


  LLT PtrTy = MRI.getType(Dst);

  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());


  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Register SPReg = Info->getStackPtrOffsetReg();

  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);


  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());

  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);


  auto OldSP = B.buildCopy(PtrTy, SPReg);

  if (Alignment > TFI.getStackAlign()) {

    auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1;

    auto Tmp1 = B.buildPtrAdd(PtrTy, OldSP,

                              B.buildConstant(LLT::scalar(32), StackAlignMask));

    B.buildMaskLowPtrBits(Dst, Tmp1,

                          Log2(Alignment) + ST.getWavefrontSizeLog2());

  } else {

    B.buildCopy(Dst, OldSP);

  }

  auto PtrAdd = B.buildPtrAdd(PtrTy, Dst, ScaledSize);

  B.buildCopy(SPReg, PtrAdd);

  MI.eraseFromParent();

  return true;

}


bool AMDGPURegisterBankInfo::applyMappingImage(

    MachineIRBuilder &B, MachineInstr &MI,

    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,

    int RsrcIdx) const {

  const int NumDefs = MI.getNumExplicitDefs();


  // The reported argument index is relative to the IR intrinsic call arguments,

  // so we need to shift by the number of defs and the intrinsic ID.

  RsrcIdx += NumDefs + 1;


  // Insert copies to VGPR arguments.

  applyDefaultMapping(OpdMapper);


  // Fixup any SGPR arguments.

  SmallVector<unsigned, 4> SGPRIndexes;

  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {

    if (!MI.getOperand(I).isReg())

      continue;


    // If this intrinsic has a sampler, it immediately follows rsrc.

    if (I == RsrcIdx || I == RsrcIdx + 1)

      SGPRIndexes.push_back(I);

  }


  executeInWaterfallLoop(B, MI, SGPRIndexes);

  return true;

}


// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store

// the three offsets (voffset, soffset and instoffset)

unsigned AMDGPURegisterBankInfo::setBufferOffsets(

    MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,

    Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {

  const LLT S32 = LLT::scalar(32);

  MachineRegisterInfo *MRI = B.getMRI();


  if (std::optional<int64_t> Imm =

          getIConstantVRegSExtVal(CombinedOffset, *MRI)) {

    uint32_t SOffset, ImmOffset;

    if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {

      VOffsetReg = B.buildConstant(S32, 0).getReg(0);

      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);

      InstOffsetVal = ImmOffset;


      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);

      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);

      return SOffset + ImmOffset;

    }

  }


  Register Base;

  unsigned Offset;


  std::tie(Base, Offset) =

      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);


  uint32_t SOffset, ImmOffset;

  if ((int)Offset > 0 &&

      TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {

    if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {

      VOffsetReg = Base;

      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);

      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);

      InstOffsetVal = ImmOffset;

      return 0; // XXX - Why is this 0?

    }


    // If we have SGPR base, we can use it for soffset.

    if (SOffset == 0) {

      VOffsetReg = B.buildConstant(S32, 0).getReg(0);

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);

      SOffsetReg = Base;

      InstOffsetVal = ImmOffset;

      return 0; // XXX - Why is this 0?

    }

  }


  // Handle the variable sgpr + vgpr case.

  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);

  if (Add && (int)Offset >= 0) {

    Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);

    Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);


    const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);

    const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);


    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {

      VOffsetReg = Src0;

      SOffsetReg = Src1;

      return 0;

    }


    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {

      VOffsetReg = Src1;

      SOffsetReg = Src0;

      return 0;

    }

  }


  // Ensure we have a VGPR for the combined offset. This could be an issue if we

  // have an SGPR offset and a VGPR resource.

  if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {

    VOffsetReg = CombinedOffset;

  } else {

    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);

    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);

  }


  SOffsetReg = B.buildConstant(S32, 0).getReg(0);

  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);

  return 0;

}


static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc) {

  switch (Opc) {

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:

    return AMDGPU::G_AMDGPU_BUFFER_LOAD;

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:

    return AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:

    return AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:

    return AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:

    return AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;

  default:

    break;

  }

  llvm_unreachable("Unexpected s_buffer_load opcode");

}


bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(

    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {

  MachineInstr &MI = OpdMapper.getMI();

  MachineRegisterInfo &MRI = OpdMapper.getMRI();


  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();

  LLT Ty = MRI.getType(Dst);


  const RegisterBank *RSrcBank =

    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

  const RegisterBank *OffsetBank =

    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  if (RSrcBank == &AMDGPU::SGPRRegBank &&

      OffsetBank == &AMDGPU::SGPRRegBank)

    return true; // Legal mapping


  // FIXME: 96-bit case was widened during legalize. We need to narrow it back

  // here but don't have an MMO.


  unsigned LoadSize = Ty.getSizeInBits();

  int NumLoads = 1;

  if (LoadSize == 256 || LoadSize == 512) {

    NumLoads = LoadSize / 128;

    Ty = Ty.divide(NumLoads);

  }


  // Use the alignment to ensure that the required offsets will fit into the

  // immediate offsets.

  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);


  MachineFunction &MF = B.getMF();


  Register SOffset;

  Register VOffset;

  int64_t ImmOffset = 0;


  unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,

                                        SOffset, ImmOffset, Alignment);


  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we

  // can, but we need to track an MMO for that.

  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;

  const Align MemAlign(4); // FIXME: ABI type alignment?

  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(

    MachinePointerInfo(),

    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |

    MachineMemOperand::MOInvariant,

    MemSize, MemAlign);

  if (MMOOffset != 0)

    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);


  // If only the offset is divergent, emit a MUBUF buffer load instead. We can

  // assume that the buffer is unswizzled.


  Register RSrc = MI.getOperand(1).getReg();

  Register VIndex = B.buildConstant(S32, 0).getReg(0);

  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);


  SmallVector<Register, 4> LoadParts(NumLoads);


  MachineBasicBlock::iterator MII = MI.getIterator();

  MachineInstrSpan Span(MII, &B.getMBB());


  for (int i = 0; i < NumLoads; ++i) {

    if (NumLoads == 1) {

      LoadParts[i] = Dst;

    } else {

      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);

      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);

    }


    MachineMemOperand *MMO = BaseMMO;

    if (i != 0)

      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);


    B.buildInstr(getSBufferLoadCorrespondingBufferLoadOpcode(MI.getOpcode()))

        .addDef(LoadParts[i])       // vdata

        .addUse(RSrc)               // rsrc

        .addUse(VIndex)             // vindex

        .addUse(VOffset)            // voffset

        .addUse(SOffset)            // soffset

        .addImm(ImmOffset + 16 * i) // offset(imm)

        .addImm(0)                  // cachepolicy, swizzled buffer(imm)

        .addImm(0)                  // idxen(imm)

        .addMemOperand(MMO);

  }


  // TODO: If only the resource is a VGPR, it may be better to execute the

  // scalar load in the waterfall loop if the resource is expected to frequently

  // be dynamically uniform.

  if (RSrcBank != &AMDGPU::SGPRRegBank) {

    // Remove the original instruction to avoid potentially confusing the

    // waterfall loop logic.

    B.setInstr(*Span.begin());

    MI.eraseFromParent();


    SmallSet<Register, 4> OpsToWaterfall;


    OpsToWaterfall.insert(RSrc);

    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),

                           OpsToWaterfall);

  }


  if (NumLoads != 1) {

    if (Ty.isVector())

      B.buildConcatVectors(Dst, LoadParts);

    else

      B.buildMergeLikeInstr(Dst, LoadParts);

  }


  // We removed the instruction earlier with a waterfall loop.

  if (RSrcBank == &AMDGPU::SGPRRegBank)

    MI.eraseFromParent();


  return true;

}


bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,

                                             const OperandsMapper &OpdMapper,

                                             bool Signed) const {

  MachineInstr &MI = OpdMapper.getMI();

  MachineRegisterInfo &MRI = OpdMapper.getMRI();


  // Insert basic copies

  applyDefaultMapping(OpdMapper);


  Register DstReg = MI.getOperand(0).getReg();

  LLT Ty = MRI.getType(DstReg);


  const LLT S32 = LLT::scalar(32);


  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;

  Register SrcReg = MI.getOperand(FirstOpnd).getReg();

  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();

  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();


  const RegisterBank *DstBank =

    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

  if (DstBank == &AMDGPU::VGPRRegBank) {

    if (Ty == S32)

      return true;


    // There is no 64-bit vgpr bitfield extract instructions so the operation

    // is expanded to a sequence of instructions that implement the operation.

    ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);


    const LLT S64 = LLT::scalar(64);

    // Shift the source operand so that extracted bits start at bit 0.

    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)

                              : B.buildLShr(S64, SrcReg, OffsetReg);

    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);


    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions

    // if the width is a constant.

    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {

      // Use the 32-bit bitfield extract instruction if the width is a constant.

      // Depending on the width size, use either the low or high 32-bits.

      auto Zero = B.buildConstant(S32, 0);

      auto WidthImm = ConstWidth->Value.getZExtValue();

      if (WidthImm <= 32) {

        // Use bitfield extract on the lower 32-bit source, and then sign-extend

        // or clear the upper 32-bits.

        auto Extract =

            Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)

                   : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);

        auto Extend =

            Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;

        B.buildMergeLikeInstr(DstReg, {Extract, Extend});

      } else {

        // Use bitfield extract on upper 32-bit source, and combine with lower

        // 32-bit source.

        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);

        auto Extract =

            Signed

                ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)

                : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);

        B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});

      }

      MI.eraseFromParent();

      return true;

    }


    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit

    // operations.

    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);

    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);

    if (Signed)

      B.buildAShr(S64, SignBit, ExtShift);

    else

      B.buildLShr(S64, SignBit, ExtShift);

    MI.eraseFromParent();

    return true;

  }


  // The scalar form packs the offset and width in a single operand.


  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);


  // Ensure the high bits are clear to insert the offset.

  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));

  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);


  // Zeros out the low bits, so don't bother clamping the input value.

  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));


  // Transformation function, pack the offset and width of a BFE into

  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second

  // source, bits [5:0] contain the offset and bits [22:16] the width.

  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);


  // TODO: It might be worth using a pseudo here to avoid scc clobber and

  // register class constraints.

  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :

                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);


  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});

  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))

    llvm_unreachable("failed to constrain BFE");


  MI.eraseFromParent();

  return true;

}


bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(

    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {

  MachineInstr &MI = OpdMapper.getMI();

  MachineRegisterInfo &MRI = OpdMapper.getMRI();


  // Insert basic copies.

  applyDefaultMapping(OpdMapper);


  Register Dst0 = MI.getOperand(0).getReg();

  Register Dst1 = MI.getOperand(1).getReg();

  Register Src0 = MI.getOperand(2).getReg();

  Register Src1 = MI.getOperand(3).getReg();

  Register Src2 = MI.getOperand(4).getReg();


  if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)

    return true;


  bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  LLT S1 = LLT::scalar(1);

  LLT S32 = LLT::scalar(32);


  bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;

  bool Accumulate = true;


  if (!DstOnValu) {

    if (mi_match(Src2, MRI, m_ZeroInt()))

      Accumulate = false;

  }


  // Keep the multiplication on the SALU.

  Register DstHi;

  Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);

  bool MulHiInVgpr = false;


  MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);


  if (Subtarget.hasSMulHi()) {

    DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)

                       : B.buildSMulH(S32, Src0, Src1).getReg(0);

    MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);

  } else {

    Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);

    Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);


    MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);

    MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);


    DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)

                       : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);

    MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);


    if (!DstOnValu) {

      DstHi = buildReadFirstLane(B, MRI, DstHi);

    } else {

      MulHiInVgpr = true;

    }

  }


  // Accumulate and produce the "carry-out" bit.

  //

  // The "carry-out" is defined as bit 64 of the result when computed as a

  // big integer. For unsigned multiply-add, this matches the usual definition

  // of carry-out. For signed multiply-add, bit 64 is the sign bit of the

  // result, which is determined as:

  //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add

  LLT CarryType = DstOnValu ? S1 : S32;

  const RegisterBank &CarryBank =

      DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;

  const RegisterBank &DstBank =

      DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;

  Register Carry;

  Register Zero;


  if (!IsUnsigned) {

    Zero = B.buildConstant(S32, 0).getReg(0);

    MRI.setRegBank(Zero,

                   MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);


    Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)

                .getReg(0);

    MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank

                                      : AMDGPU::SGPRRegBank);


    if (DstOnValu && !MulHiInVgpr) {

      Carry = B.buildTrunc(S1, Carry).getReg(0);

      MRI.setRegBank(Carry, AMDGPU::VCCRegBank);

    }

  }


  if (Accumulate) {

    if (DstOnValu) {

      DstLo = B.buildCopy(S32, DstLo).getReg(0);

      DstHi = B.buildCopy(S32, DstHi).getReg(0);

      MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);

      MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);

    }


    auto Unmerge = B.buildUnmerge(S32, Src2);

    Register Src2Lo = Unmerge.getReg(0);

    Register Src2Hi = Unmerge.getReg(1);

    MRI.setRegBank(Src2Lo, DstBank);

    MRI.setRegBank(Src2Hi, DstBank);


    if (!IsUnsigned) {

      auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);

      MRI.setRegBank(Src2Sign.getReg(0), CarryBank);


      Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);

      MRI.setRegBank(Carry, CarryBank);

    }


    auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);

    DstLo = AddLo.getReg(0);

    Register CarryLo = AddLo.getReg(1);

    MRI.setRegBank(DstLo, DstBank);

    MRI.setRegBank(CarryLo, CarryBank);


    auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);

    DstHi = AddHi.getReg(0);

    MRI.setRegBank(DstHi, DstBank);


    Register CarryHi = AddHi.getReg(1);

    MRI.setRegBank(CarryHi, CarryBank);


    if (IsUnsigned) {

      Carry = CarryHi;

    } else {

      Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);

      MRI.setRegBank(Carry, CarryBank);

    }

  } else {

    if (IsUnsigned) {

      Carry = B.buildConstant(CarryType, 0).getReg(0);

      MRI.setRegBank(Carry, CarryBank);

    }

  }


  B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});


  if (DstOnValu) {

    B.buildCopy(Dst1, Carry);

  } else {

    B.buildTrunc(Dst1, Carry);

  }


  MI.eraseFromParent();

  return true;

}


// Return a suitable opcode for extending the operands of Opc when widening.

static unsigned getExtendOp(unsigned Opc) {

  switch (Opc) {

  case TargetOpcode::G_ASHR:

  case TargetOpcode::G_SMIN:

  case TargetOpcode::G_SMAX:

    return TargetOpcode::G_SEXT;

  case TargetOpcode::G_LSHR:

  case TargetOpcode::G_UMIN:

  case TargetOpcode::G_UMAX:

    return TargetOpcode::G_ZEXT;

  default:

    return TargetOpcode::G_ANYEXT;

  }

}


// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding

// any illegal vector extend or unmerge operations.

static std::pair<Register, Register>

unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {

  const LLT S32 = LLT::scalar(32);

  auto Bitcast = B.buildBitcast(S32, Src);


  if (ExtOpcode == TargetOpcode::G_SEXT) {

    auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);

    auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));

    return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));

  }


  auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));

  if (ExtOpcode == TargetOpcode::G_ZEXT) {

    auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));

    return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));

  }


  assert(ExtOpcode == TargetOpcode::G_ANYEXT);

  return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));

}


// For cases where only a single copy is inserted for matching register banks.

// Replace the register in the instruction operand

static bool substituteSimpleCopyRegs(

  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {

  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));

  if (!SrcReg.empty()) {

    assert(SrcReg.size() == 1);

    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);

    return true;

  }


  return false;

}


/// Handle register layout difference for f16 images for some subtargets.

Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,

                                                MachineRegisterInfo &MRI,

                                                Register Reg) const {

  if (!Subtarget.hasUnpackedD16VMem())

    return Reg;


  const LLT S16 = LLT::scalar(16);

  LLT StoreVT = MRI.getType(Reg);

  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)

    return Reg;


  auto Unmerge = B.buildUnmerge(S16, Reg);


  SmallVector<Register, 4> WideRegs;

  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)

    WideRegs.push_back(Unmerge.getReg(I));


  const LLT S32 = LLT::scalar(32);

  int NumElts = StoreVT.getNumElements();


  return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)

      .getReg(0);

}


static std::pair<Register, unsigned>

getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {

  int64_t Const;

  if (mi_match(Reg, MRI, m_ICst(Const)))

    return std::pair(Register(), Const);


  Register Base;

  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))

    return std::pair(Base, Const);


  // TODO: Handle G_OR used for add case

  return std::pair(Reg, 0);

}


std::pair<Register, unsigned>

AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,

                                           Register OrigOffset) const {

  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget);

  Register BaseReg;

  unsigned ImmOffset;

  const LLT S32 = LLT::scalar(32);


  // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.

  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),

                                                           OrigOffset);


  unsigned C1 = 0;

  if (ImmOffset != 0) {

    // If the immediate value is too big for the immoffset field, put only bits

    // that would normally fit in the immoffset field. The remaining value that

    // is copied/added for the voffset field is a large power of 2, and it

    // stands more chance of being CSEd with the copy/add for another similar

    // load/store.

    // However, do not do that rounding down if that is a negative

    // number, as it appears to be illegal to have a negative offset in the

    // vgpr, even if adding the immediate offset makes it positive.

    unsigned Overflow = ImmOffset & ~MaxImm;

    ImmOffset -= Overflow;

    if ((int32_t)Overflow < 0) {

      Overflow += ImmOffset;

      ImmOffset = 0;

    }


    C1 = ImmOffset;

    if (Overflow != 0) {

      if (!BaseReg)

        BaseReg = B.buildConstant(S32, Overflow).getReg(0);

      else {

        auto OverflowVal = B.buildConstant(S32, Overflow);

        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

      }

    }

  }


  if (!BaseReg)

    BaseReg = B.buildConstant(S32, 0).getReg(0);


  return {BaseReg, C1};

}


bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,

                                        Register SrcReg) const {

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT SrcTy = MRI.getType(SrcReg);

  if (SrcTy.getSizeInBits() == 32) {

    // Use a v_mov_b32 here to make the exec dependency explicit.

    B.buildInstr(AMDGPU::V_MOV_B32_e32)

      .addDef(DstReg)

      .addUse(SrcReg);

    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&

           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);

  }


  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);


  B.buildInstr(AMDGPU::V_MOV_B32_e32)

    .addDef(TmpReg0)

    .addUse(SrcReg, 0, AMDGPU::sub0);

  B.buildInstr(AMDGPU::V_MOV_B32_e32)

    .addDef(TmpReg1)

    .addUse(SrcReg, 0, AMDGPU::sub1);

  B.buildInstr(AMDGPU::REG_SEQUENCE)

    .addDef(DstReg)

    .addUse(TmpReg0)

    .addImm(AMDGPU::sub0)

    .addUse(TmpReg1)

    .addImm(AMDGPU::sub1);


  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&

         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);

}


/// Utility function for pushing dynamic vector indexes with a constant offset

/// into waterfall loops.

static void reinsertVectorIndexAdd(MachineIRBuilder &B,

                                   MachineInstr &IdxUseInstr,

                                   unsigned OpIdx,

                                   unsigned ConstOffset) {

  MachineRegisterInfo &MRI = *B.getMRI();

  const LLT S32 = LLT::scalar(32);

  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();

  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());


  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);


  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);

  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);

  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);

  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));

}


/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the

/// original 32-bit source value (to be inserted in the low part of the combined

/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit

/// value.

static void extendLow32IntoHigh32(MachineIRBuilder &B,

                                  Register Hi32Reg, Register Lo32Reg,

                                  unsigned ExtOpc,

                                  const RegisterBank &RegBank,

                                  bool IsBooleanSrc = false) {

  if (ExtOpc == AMDGPU::G_ZEXT) {

    B.buildConstant(Hi32Reg, 0);

  } else if (ExtOpc == AMDGPU::G_SEXT) {

    if (IsBooleanSrc) {

      // If we know the original source was an s1, the high half is the same as

      // the low.

      B.buildCopy(Hi32Reg, Lo32Reg);

    } else {

      // Replicate sign bit from 32-bit extended part.

      auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);

      B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);

      B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);

    }

  } else {

    assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");

    B.buildUndef(Hi32Reg);

  }

}


bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(

    MachineIRBuilder &B, MachineInstr &MI,

    const OperandsMapper &OpdMapper) const {

  MachineRegisterInfo &MRI = *B.getMRI();


  Register VecReg = MI.getOperand(1).getReg();

  Register Idx = MI.getOperand(2).getReg();


  const RegisterBank &IdxBank =

    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;


  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;


  LLT VecTy = MRI.getType(VecReg);

  unsigned EltSize = VecTy.getScalarSizeInBits();

  unsigned NumElem = VecTy.getNumElements();


  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,

                                                  IsDivergentIdx, &Subtarget))

    return false;


  LLT S32 = LLT::scalar(32);


  const RegisterBank &DstBank =

    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

  const RegisterBank &SrcBank =

    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;


  const RegisterBank &CCBank =

    (DstBank == AMDGPU::SGPRRegBank &&

     SrcBank == AMDGPU::SGPRRegBank &&

     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank

                                     : AMDGPU::VCCRegBank;

  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);


  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {

    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();

    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);

  }


  LLT EltTy = VecTy.getScalarType();

  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

  unsigned NumLanes = DstRegs.size();

  if (!NumLanes)

    NumLanes = 1;

  else

    EltTy = MRI.getType(DstRegs[0]);


  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);

  SmallVector<Register, 2> Res(NumLanes);

  for (unsigned L = 0; L < NumLanes; ++L)

    Res[L] = UnmergeToEltTy.getReg(L);


  for (unsigned I = 1; I < NumElem; ++I) {

    auto IC = B.buildConstant(S32, I);

    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);

    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);

    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);


    for (unsigned L = 0; L < NumLanes; ++L) {

      auto S = B.buildSelect(EltTy, Cmp,

                             UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);


      for (unsigned N : { 0, 2, 3 })

        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);


      Res[L] = S->getOperand(0).getReg();

    }

  }


  for (unsigned L = 0; L < NumLanes; ++L) {

    Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];

    B.buildCopy(DstReg, Res[L]);

    MRI.setRegBank(DstReg, DstBank);

  }


  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);

  MI.eraseFromParent();


  return true;

}


// Insert a cross regbank copy for a register if it already has a bank that

// differs from the one we want to set.

static Register constrainRegToBank(MachineRegisterInfo &MRI,

                                   MachineIRBuilder &B, Register &Reg,

                                   const RegisterBank &Bank) {

  const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);

  if (CurrBank && *CurrBank != Bank) {

    Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);

    MRI.setRegBank(Copy, Bank);

    return Copy;

  }


  MRI.setRegBank(Reg, Bank);

  return Reg;

}


bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(

    MachineIRBuilder &B, MachineInstr &MI,

    const OperandsMapper &OpdMapper) const {


  MachineRegisterInfo &MRI = *B.getMRI();

  Register VecReg = MI.getOperand(1).getReg();

  Register Idx = MI.getOperand(3).getReg();


  const RegisterBank &IdxBank =

    *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;


  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;


  LLT VecTy = MRI.getType(VecReg);

  unsigned EltSize = VecTy.getScalarSizeInBits();

  unsigned NumElem = VecTy.getNumElements();


  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,

                                                  IsDivergentIdx, &Subtarget))

    return false;


  LLT S32 = LLT::scalar(32);


  const RegisterBank &DstBank =

    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

  const RegisterBank &SrcBank =

    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

  const RegisterBank &InsBank =

    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;


  const RegisterBank &CCBank =

    (DstBank == AMDGPU::SGPRRegBank &&

     SrcBank == AMDGPU::SGPRRegBank &&

     InsBank == AMDGPU::SGPRRegBank &&

     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank

                                     : AMDGPU::VCCRegBank;

  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);


  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {

    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();

    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);

  }


  LLT EltTy = VecTy.getScalarType();

  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));

  unsigned NumLanes = InsRegs.size();

  if (!NumLanes) {

    NumLanes = 1;

    InsRegs.push_back(MI.getOperand(2).getReg());

  } else {

    EltTy = MRI.getType(InsRegs[0]);

  }


  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);

  SmallVector<Register, 16> Ops(NumElem * NumLanes);


  for (unsigned I = 0; I < NumElem; ++I) {

    auto IC = B.buildConstant(S32, I);

    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);

    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);

    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);


    for (unsigned L = 0; L < NumLanes; ++L) {

      Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);

      Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);

      Op1 = constrainRegToBank(MRI, B, Op1, DstBank);


      Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);

      MRI.setRegBank(Select, DstBank);


      Ops[I * NumLanes + L] = Select;

    }

  }


  LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);

  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {

    B.buildBuildVector(MI.getOperand(0), Ops);

  } else {

    auto Vec = B.buildBuildVector(MergeTy, Ops);

    MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);

    B.buildBitcast(MI.getOperand(0).getReg(), Vec);

  }


  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);

  MI.eraseFromParent();


  return true;

}


// Break s_mul_u64 into 32-bit vector operations.

void AMDGPURegisterBankInfo::applyMappingSMULU64(

    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {

  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

  SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));

  SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));


  // All inputs are SGPRs, nothing special to do.

  if (DefRegs.empty()) {

    assert(Src0Regs.empty() && Src1Regs.empty());

    applyDefaultMapping(OpdMapper);

    return;

  }


  assert(DefRegs.size() == 2);

  assert(Src0Regs.size() == Src1Regs.size() &&

         (Src0Regs.empty() || Src0Regs.size() == 2));


  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  MachineInstr &MI = OpdMapper.getMI();

  Register DstReg = MI.getOperand(0).getReg();

  LLT HalfTy = LLT::scalar(32);


  // Depending on where the source registers came from, the generic code may

  // have decided to split the inputs already or not. If not, we still need to

  // extract the values.


  if (Src0Regs.empty())

    split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());

  else

    setRegsToType(MRI, Src0Regs, HalfTy);


  if (Src1Regs.empty())

    split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());

  else

    setRegsToType(MRI, Src1Regs, HalfTy);


  setRegsToType(MRI, DefRegs, HalfTy);


  // The multiplication is done as follows:

  //

  //                            Op1H  Op1L

  //                          * Op0H  Op0L

  //                       --------------------

  //                       Op1H*Op0L  Op1L*Op0L

  //          + Op1H*Op0H  Op1L*Op0H

  // -----------------------------------------

  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L

  //

  //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit

  //  value and that would overflow.

  //  The low 32-bit value is Op1L*Op0L.

  //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from

  //  Op1L*Op0L).


  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);


  Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);

  Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0);

  Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0);

  Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0);

  B.buildAdd(DefRegs[1], Add, MulHiLo);

  B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]);


  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);

  MI.eraseFromParent();

}


void AMDGPURegisterBankInfo::applyMappingImpl(

    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {

  MachineInstr &MI = OpdMapper.getMI();

  B.setInstrAndDebugLoc(MI);

  unsigned Opc = MI.getOpcode();

  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  switch (Opc) {

  case AMDGPU::G_CONSTANT:

  case AMDGPU::G_IMPLICIT_DEF: {

    Register DstReg = MI.getOperand(0).getReg();

    LLT DstTy = MRI.getType(DstReg);

    if (DstTy != LLT::scalar(1))

      break;


    const RegisterBank *DstBank =

        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (DstBank == &AMDGPU::VCCRegBank)

      break;

    SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));

    if (DefRegs.empty())

      DefRegs.push_back(DstReg);


    B.setInsertPt(*MI.getParent(), ++MI.getIterator());


    Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));

    LLVMContext &Ctx = B.getMF().getFunction().getContext();


    MI.getOperand(0).setReg(NewDstReg);

    if (Opc != AMDGPU::G_IMPLICIT_DEF) {

      uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();

      MI.getOperand(1).setCImm(

          ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));

    }


    MRI.setRegBank(NewDstReg, *DstBank);

    B.buildTrunc(DefRegs[0], NewDstReg);

    return;

  }

  case AMDGPU::G_PHI: {

    Register DstReg = MI.getOperand(0).getReg();

    LLT DstTy = MRI.getType(DstReg);

    if (DstTy != LLT::scalar(1))

      break;


    const LLT S32 = LLT::scalar(32);

    const RegisterBank *DstBank =

      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (DstBank == &AMDGPU::VCCRegBank) {

      applyDefaultMapping(OpdMapper);

      // The standard handling only considers the result register bank for

      // phis. For VCC, blindly inserting a copy when the phi is lowered will

      // produce an invalid copy. We can only copy with some kind of compare to

      // get a vector boolean result. Insert a register bank copy that will be

      // correctly lowered to a compare.

      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {

        Register SrcReg = MI.getOperand(I).getReg();

        const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);


        if (SrcBank != &AMDGPU::VCCRegBank) {

          MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();

          B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());


          auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);

          MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);

          MI.getOperand(I).setReg(Copy.getReg(0));

        }

      }


      return;

    }


    // Phi handling is strange and only considers the bank of the destination.

    substituteSimpleCopyRegs(OpdMapper, 0);


    // Promote SGPR/VGPR booleans to s32

    ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);

    B.setInsertPt(B.getMBB(), MI);

    LegalizerHelper Helper(B.getMF(), ApplyBank, B);


    if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)

      llvm_unreachable("widen scalar should have succeeded");


    return;

  }

  case AMDGPU::G_FCMP:

    if (!Subtarget.hasSALUFloatInsts())

      break;

    [[fallthrough]];

  case AMDGPU::G_ICMP:

  case AMDGPU::G_UADDO:

  case AMDGPU::G_USUBO:

  case AMDGPU::G_UADDE:

  case AMDGPU::G_SADDE:

  case AMDGPU::G_USUBE:

  case AMDGPU::G_SSUBE: {

    unsigned BoolDstOp =

        (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;

    Register DstReg = MI.getOperand(BoolDstOp).getReg();


    const RegisterBank *DstBank =

      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (DstBank != &AMDGPU::SGPRRegBank)

      break;


    const bool HasCarryIn = MI.getNumOperands() == 5;


    // If this is a scalar compare, promote the result to s32, as the selection

    // will end up using a copy to a 32-bit vreg.

    const LLT S32 = LLT::scalar(32);

    Register NewDstReg = MRI.createGenericVirtualRegister(S32);

    MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);

    MI.getOperand(BoolDstOp).setReg(NewDstReg);


    if (HasCarryIn) {

      Register NewSrcReg = MRI.createGenericVirtualRegister(S32);

      MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);

      B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());

      MI.getOperand(4).setReg(NewSrcReg);

    }


    MachineBasicBlock *MBB = MI.getParent();

    B.setInsertPt(*MBB, std::next(MI.getIterator()));


    // If we had a constrained VCC result register, a copy was inserted to VCC

    // from SGPR.

    SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));

    if (DefRegs.empty())

      DefRegs.push_back(DstReg);

    B.buildTrunc(DefRegs[0], NewDstReg);

    return;

  }

  case AMDGPU::G_SELECT: {

    Register DstReg = MI.getOperand(0).getReg();

    LLT DstTy = MRI.getType(DstReg);


    SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));

    if (CondRegs.empty())

      CondRegs.push_back(MI.getOperand(1).getReg());

    else {

      assert(CondRegs.size() == 1);

    }


    const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);

    if (CondBank == &AMDGPU::SGPRRegBank) {

      const LLT S32 = LLT::scalar(32);

      Register NewCondReg = MRI.createGenericVirtualRegister(S32);

      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);


      MI.getOperand(1).setReg(NewCondReg);

      B.buildZExt(NewCondReg, CondRegs[0]);

    }


    if (DstTy.getSizeInBits() != 64)

      break;


    LLT HalfTy = getHalfSizedType(DstTy);


    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));


    // All inputs are SGPRs, nothing special to do.

    if (DefRegs.empty()) {

      assert(Src1Regs.empty() && Src2Regs.empty());

      break;

    }


    if (Src1Regs.empty())

      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());

    else {

      setRegsToType(MRI, Src1Regs, HalfTy);

    }


    if (Src2Regs.empty())

      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());

    else

      setRegsToType(MRI, Src2Regs, HalfTy);


    setRegsToType(MRI, DefRegs, HalfTy);


    auto Flags = MI.getFlags();

    B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0], Flags);

    B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1], Flags);


    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);

    MI.eraseFromParent();

    return;

  }

  case AMDGPU::G_BRCOND: {

    Register CondReg = MI.getOperand(0).getReg();

    // FIXME: Should use legalizer helper, but should change bool ext type.

    const RegisterBank *CondBank =

      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;


    if (CondBank == &AMDGPU::SGPRRegBank) {

      const LLT S32 = LLT::scalar(32);

      Register NewCondReg = MRI.createGenericVirtualRegister(S32);

      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);


      MI.getOperand(0).setReg(NewCondReg);

      B.buildZExt(NewCondReg, CondReg);

      return;

    }


    break;

  }

  case AMDGPU::G_AND:

  case AMDGPU::G_OR:

  case AMDGPU::G_XOR: {

    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if

    // there is a VGPR input.

    Register DstReg = MI.getOperand(0).getReg();

    LLT DstTy = MRI.getType(DstReg);


    const RegisterBank *DstBank =

        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;


    if (DstTy.getSizeInBits() == 1) {

      if (DstBank == &AMDGPU::VCCRegBank)

        break;


      MachineFunction *MF = MI.getParent()->getParent();

      ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);

      LegalizerHelper Helper(*MF, ApplyBank, B);


      if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=

          LegalizerHelper::Legalized)

        llvm_unreachable("widen scalar should have succeeded");

      return;

    }


    if (DstTy.getSizeInBits() == 16 && DstBank == &AMDGPU::SGPRRegBank) {

      const LLT S32 = LLT::scalar(32);

      MachineBasicBlock *MBB = MI.getParent();

      MachineFunction *MF = MBB->getParent();

      ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);

      LegalizerHelper Helper(*MF, ApplySALU, B);

      // Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening

      // will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1

      // as "not".

      if (MI.getOpcode() == AMDGPU::G_XOR &&

          mi_match(MI.getOperand(2).getReg(), MRI, m_SpecificICstOrSplat(-1))) {

        Helper.widenScalarSrc(MI, S32, 1, AMDGPU::G_ANYEXT);

        Helper.widenScalarSrc(MI, S32, 2, AMDGPU::G_SEXT);

        Helper.widenScalarDst(MI, S32);

      } else {

        if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)

          llvm_unreachable("widen scalar should have succeeded");

      }

      return;

    }


    if (DstTy.getSizeInBits() != 64)

      break;


    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));

    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));


    // All inputs are SGPRs, nothing special to do.

    if (DefRegs.empty()) {

      assert(Src0Regs.empty() && Src1Regs.empty());

      break;

    }


    assert(DefRegs.size() == 2);

    assert(Src0Regs.size() == Src1Regs.size() &&

           (Src0Regs.empty() || Src0Regs.size() == 2));


    // Depending on where the source registers came from, the generic code may

    // have decided to split the inputs already or not. If not, we still need to

    // extract the values.


    if (Src0Regs.empty())

      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());

    else

      setRegsToType(MRI, Src0Regs, HalfTy);


    if (Src1Regs.empty())

      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());

    else

      setRegsToType(MRI, Src1Regs, HalfTy);


    setRegsToType(MRI, DefRegs, HalfTy);


    auto Flags = MI.getFlags();

    B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}, Flags);

    B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}, Flags);


    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);

    MI.eraseFromParent();

    return;

  }

  case AMDGPU::G_ABS: {

    Register SrcReg = MI.getOperand(1).getReg();

    const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);


    // There is no VALU abs instruction so we need to replace it with a sub and

    // max combination.

    if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {

      MachineFunction *MF = MI.getParent()->getParent();

      ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);

      LegalizerHelper Helper(*MF, Apply, B);


      if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)

        llvm_unreachable("lowerAbsToMaxNeg should have succeeded");

      return;

    }

    [[fallthrough]];

  }

  case AMDGPU::G_ADD:

  case AMDGPU::G_SUB:

  case AMDGPU::G_MUL:

  case AMDGPU::G_SHL:

  case AMDGPU::G_LSHR:

  case AMDGPU::G_ASHR:

  case AMDGPU::G_SMIN:

  case AMDGPU::G_SMAX:

  case AMDGPU::G_UMIN:

  case AMDGPU::G_UMAX: {

    Register DstReg = MI.getOperand(0).getReg();

    LLT DstTy = MRI.getType(DstReg);


    // Special case for s_mul_u64. There is not a vector equivalent of

    // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector

    // multiplications.

    if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL &&

        DstTy.getSizeInBits() == 64) {

      applyMappingSMULU64(B, OpdMapper);

      return;

    }


    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.

    // Packed 16-bit operations need to be scalarized and promoted.

    if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))

      break;


    const RegisterBank *DstBank =

        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (DstBank == &AMDGPU::VGPRRegBank)

      break;


    const LLT S32 = LLT::scalar(32);

    MachineBasicBlock *MBB = MI.getParent();

    MachineFunction *MF = MBB->getParent();

    ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);


    if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {

      Register WideSrcLo, WideSrcHi;


      std::tie(WideSrcLo, WideSrcHi) =

          unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);

      auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});

      auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});

      B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});

      MI.eraseFromParent();

      return;

    }


    if (DstTy.isVector()) {

      Register WideSrc0Lo, WideSrc0Hi;

      Register WideSrc1Lo, WideSrc1Hi;


      unsigned ExtendOp = getExtendOp(MI.getOpcode());

      std::tie(WideSrc0Lo, WideSrc0Hi)

        = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);

      std::tie(WideSrc1Lo, WideSrc1Hi)

        = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);

      auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});

      auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});

      B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});

      MI.eraseFromParent();

    } else {

      LegalizerHelper Helper(*MF, ApplySALU, B);


      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)

        llvm_unreachable("widen scalar should have succeeded");


      // FIXME: s16 shift amounts should be legal.

      if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||

          Opc == AMDGPU::G_ASHR) {

        B.setInsertPt(*MBB, MI.getIterator());

        if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)

          llvm_unreachable("widen scalar should have succeeded");

      }

    }


    return;

  }

  case AMDGPU::G_AMDGPU_S_MUL_I64_I32:

  case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {

    // This is a special case for s_mul_u64. We use

    // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation

    // where the 33 higher bits are sign-extended and

    // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation

    // where the 32 higher bits are zero-extended. In case scalar registers are

    // selected, both opcodes are lowered as s_mul_u64. If the vector registers

    // are selected, then G_AMDGPU_S_MUL_I64_I32 and

    // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.


    // Insert basic copies.

    applyDefaultMapping(OpdMapper);


    Register DstReg = MI.getOperand(0).getReg();

    Register SrcReg0 = MI.getOperand(1).getReg();

    Register SrcReg1 = MI.getOperand(2).getReg();

    const LLT S32 = LLT::scalar(32);

    const LLT S64 = LLT::scalar(64);

    assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "

                                         "that handles only 64-bit operands.");

    const RegisterBank *DstBank =

        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;


    // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32

    // with s_mul_u64 operation.

    if (DstBank == &AMDGPU::SGPRRegBank) {

      MI.setDesc(TII->get(AMDGPU::S_MUL_U64));

      MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);

      MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);

      MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);

      return;

    }


    // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32

    // with a vector mad.

    assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&

           "The destination operand should be in vector registers.");


    // Extract the lower subregister from the first operand.

    Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);

    MRI.setType(Op0L, S32);

    B.buildTrunc(Op0L, SrcReg0);


    // Extract the lower subregister from the second operand.

    Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);

    MRI.setType(Op1L, S32);

    B.buildTrunc(Op1L, SrcReg1);


    unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32

                          ? AMDGPU::G_AMDGPU_MAD_U64_U32

                          : AMDGPU::G_AMDGPU_MAD_I64_I32;


    MachineIRBuilder B(MI);

    Register Zero64 = B.buildConstant(S64, 0).getReg(0);

    MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);

    Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);

    B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});

    MI.eraseFromParent();

    return;

  }

  case AMDGPU::G_SEXT_INREG: {

    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));

    if (SrcRegs.empty())

      break; // Nothing to repair


    const LLT S32 = LLT::scalar(32);

    ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);


    // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs

    // we would need to further expand, and doesn't let us directly set the

    // result registers.

    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));


    int Amt = MI.getOperand(2).getImm();

    if (Amt <= 32) {

      // Downstream users have expectations for the high bit behavior, so freeze

      // incoming undefined bits.

      if (Amt == 32) {

        // The low bits are unchanged.

        B.buildFreeze(DstRegs[0], SrcRegs[0]);

      } else {

        auto Freeze = B.buildFreeze(S32, SrcRegs[0]);

        // Extend in the low bits and propagate the sign bit to the high half.

        B.buildSExtInReg(DstRegs[0], Freeze, Amt);

      }


      B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));

    } else {

      // The low bits are unchanged, and extend in the high bits.

      // No freeze required

      B.buildCopy(DstRegs[0], SrcRegs[0]);

      B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);

    }


    Register DstReg = MI.getOperand(0).getReg();

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);

    MI.eraseFromParent();

    return;

  }

  case AMDGPU::G_CTPOP:

  case AMDGPU::G_BITREVERSE: {

    const RegisterBank *DstBank =

      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (DstBank == &AMDGPU::SGPRRegBank)

      break;


    Register SrcReg = MI.getOperand(1).getReg();

    const LLT S32 = LLT::scalar(32);

    LLT Ty = MRI.getType(SrcReg);

    if (Ty == S32)

      break;


    ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);


    MachineFunction &MF = B.getMF();

    LegalizerHelper Helper(MF, ApplyVALU, B);


    if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)

      llvm_unreachable("narrowScalar should have succeeded");

    return;

  }

  case AMDGPU::G_AMDGPU_FFBH_U32:

  case AMDGPU::G_AMDGPU_FFBL_B32:

  case AMDGPU::G_CTLZ_ZERO_UNDEF:

  case AMDGPU::G_CTTZ_ZERO_UNDEF: {

    const RegisterBank *DstBank =

        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (DstBank == &AMDGPU::SGPRRegBank)

      break;


    Register SrcReg = MI.getOperand(1).getReg();

    const LLT S32 = LLT::scalar(32);

    LLT Ty = MRI.getType(SrcReg);

    if (Ty == S32)

      break;


    // We can narrow this more efficiently than Helper can by using ffbh/ffbl

    // which return -1 when the input is zero:

    // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))

    // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))

    // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))

    // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))

    ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);

    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));

    unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF

                          ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32

                          : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF

                                ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32

                                : Opc;

    unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;

    auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});

    auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});

    unsigned AddOpc =

        Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF

            ? AMDGPU::G_ADD

            : AMDGPU::G_UADDSAT;

    Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});

    Register DstReg = MI.getOperand(0).getReg();

    B.buildUMin(DstReg, X, Y);

    MI.eraseFromParent();

    return;

  }

  case AMDGPU::G_SEXT:

  case AMDGPU::G_ZEXT:

  case AMDGPU::G_ANYEXT: {

    Register SrcReg = MI.getOperand(1).getReg();

    LLT SrcTy = MRI.getType(SrcReg);

    const bool Signed = Opc == AMDGPU::G_SEXT;


    assert(OpdMapper.getVRegs(1).empty());


    const RegisterBank *SrcBank =

      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;


    Register DstReg = MI.getOperand(0).getReg();

    LLT DstTy = MRI.getType(DstReg);

    if (DstTy.isScalar() &&

        SrcBank != &AMDGPU::SGPRRegBank &&

        SrcBank != &AMDGPU::VCCRegBank &&

        // FIXME: Should handle any type that round to s64 when irregular

        // breakdowns supported.

        DstTy.getSizeInBits() == 64 &&

        SrcTy.getSizeInBits() <= 32) {

      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));


      // Extend to 32-bit, and then extend the low half.

      if (Signed) {

        // TODO: Should really be buildSExtOrCopy

        B.buildSExtOrTrunc(DefRegs[0], SrcReg);

      } else if (Opc == AMDGPU::G_ZEXT) {

        B.buildZExtOrTrunc(DefRegs[0], SrcReg);

      } else {

        B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);

      }


      extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);

      MRI.setRegBank(DstReg, *SrcBank);

      MI.eraseFromParent();

      return;

    }


    if (SrcTy != LLT::scalar(1))

      return;


    // It is not legal to have a legalization artifact with a VCC source. Rather

    // than introducing a copy, insert the select we would have to select the

    // copy to.

    if (SrcBank == &AMDGPU::VCCRegBank) {

      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));


      const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;


      unsigned DstSize = DstTy.getSizeInBits();

      // 64-bit select is SGPR only

      const bool UseSel64 = DstSize > 32 &&

        SrcBank->getID() == AMDGPU::SGPRRegBankID;


      // TODO: Should s16 select be legal?

      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);

      auto True = B.buildConstant(SelType, Signed ? -1 : 1);

      auto False = B.buildConstant(SelType, 0);


      MRI.setRegBank(True.getReg(0), *DstBank);

      MRI.setRegBank(False.getReg(0), *DstBank);

      MRI.setRegBank(DstReg, *DstBank);


      if (DstSize > 32) {

        B.buildSelect(DefRegs[0], SrcReg, True, False);

        extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);

      } else if (DstSize < 32) {

        auto Sel = B.buildSelect(SelType, SrcReg, True, False);

        MRI.setRegBank(Sel.getReg(0), *DstBank);

        B.buildTrunc(DstReg, Sel);

      } else {

        B.buildSelect(DstReg, SrcReg, True, False);

      }


      MI.eraseFromParent();

      return;

    }


    break;

  }

  case AMDGPU::G_EXTRACT_VECTOR_ELT: {

    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));


    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());


    Register DstReg = MI.getOperand(0).getReg();

    Register SrcReg = MI.getOperand(1).getReg();


    const LLT S32 = LLT::scalar(32);

    LLT DstTy = MRI.getType(DstReg);

    LLT SrcTy = MRI.getType(SrcReg);


    if (foldExtractEltToCmpSelect(B, MI, OpdMapper))

      return;


    const ValueMapping &DstMapping

      = OpdMapper.getInstrMapping().getOperandMapping(0);

    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;

    const RegisterBank *SrcBank =

      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

    const RegisterBank *IdxBank =

        OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;


    Register BaseIdxReg;

    unsigned ConstOffset;

    std::tie(BaseIdxReg, ConstOffset) =

        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());


    // See if the index is an add of a constant which will be foldable by moving

    // the base register of the index later if this is going to be executed in a

    // waterfall loop. This is essentially to reassociate the add of a constant

    // with the readfirstlane.

    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&

                                   ConstOffset > 0 &&

                                   ConstOffset < SrcTy.getNumElements();


    // Move the base register. We'll re-insert the add later.

    if (ShouldMoveIndexIntoLoop)

      MI.getOperand(2).setReg(BaseIdxReg);


    // If this is a VGPR result only because the index was a VGPR result, the

    // actual indexing will be done on the SGPR source vector, which will

    // produce a scalar result. We need to copy to the VGPR result inside the

    // waterfall loop.

    const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&

                                SrcBank == &AMDGPU::SGPRRegBank;

    if (DstRegs.empty()) {

      applyDefaultMapping(OpdMapper);


      executeInWaterfallLoop(B, MI, {2});


      if (NeedCopyToVGPR) {

        // We don't want a phi for this temporary reg.

        Register TmpReg = MRI.createGenericVirtualRegister(DstTy);

        MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);

        MI.getOperand(0).setReg(TmpReg);

        B.setInsertPt(*MI.getParent(), ++MI.getIterator());


        // Use a v_mov_b32 here to make the exec dependency explicit.

        buildVCopy(B, DstReg, TmpReg);

      }


      // Re-insert the constant offset add inside the waterfall loop.

      if (ShouldMoveIndexIntoLoop)

        reinsertVectorIndexAdd(B, MI, 2, ConstOffset);


      return;

    }


    assert(DstTy.getSizeInBits() == 64);


    LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);


    auto CastSrc = B.buildBitcast(Vec32, SrcReg);

    auto One = B.buildConstant(S32, 1);


    MachineBasicBlock::iterator MII = MI.getIterator();


    // Split the vector index into 32-bit pieces. Prepare to move all of the

    // new instructions into a waterfall loop if necessary.

    //

    // Don't put the bitcast or constant in the loop.

    MachineInstrSpan Span(MII, &B.getMBB());


    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).

    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);

    auto IdxHi = B.buildAdd(S32, IdxLo, One);


    auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);

    auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);


    MRI.setRegBank(DstReg, *DstBank);

    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);

    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);

    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);

    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);


    SmallSet<Register, 4> OpsToWaterfall;

    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {

      MI.eraseFromParent();

      return;

    }


    // Remove the original instruction to avoid potentially confusing the

    // waterfall loop logic.

    B.setInstr(*Span.begin());

    MI.eraseFromParent();

    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),

                           OpsToWaterfall);


    if (NeedCopyToVGPR) {

      MachineBasicBlock *LoopBB = Extract1->getParent();

      Register TmpReg0 = MRI.createGenericVirtualRegister(S32);

      Register TmpReg1 = MRI.createGenericVirtualRegister(S32);

      MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);

      MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);


      Extract0->getOperand(0).setReg(TmpReg0);

      Extract1->getOperand(0).setReg(TmpReg1);


      B.setInsertPt(*LoopBB, ++Extract1->getIterator());


      buildVCopy(B, DstRegs[0], TmpReg0);

      buildVCopy(B, DstRegs[1], TmpReg1);

    }


    if (ShouldMoveIndexIntoLoop)

      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);


    return;

  }

  case AMDGPU::G_INSERT_VECTOR_ELT: {

    SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));


    Register DstReg = MI.getOperand(0).getReg();

    LLT VecTy = MRI.getType(DstReg);


    assert(OpdMapper.getVRegs(0).empty());

    assert(OpdMapper.getVRegs(3).empty());


    if (substituteSimpleCopyRegs(OpdMapper, 1))

      MRI.setType(MI.getOperand(1).getReg(), VecTy);


    if (foldInsertEltToCmpSelect(B, MI, OpdMapper))

      return;


    const RegisterBank *IdxBank =

      OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;


    Register SrcReg = MI.getOperand(1).getReg();

    Register InsReg = MI.getOperand(2).getReg();

    LLT InsTy = MRI.getType(InsReg);

    (void)InsTy;


    Register BaseIdxReg;

    unsigned ConstOffset;

    std::tie(BaseIdxReg, ConstOffset) =

        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());


    // See if the index is an add of a constant which will be foldable by moving

    // the base register of the index later if this is going to be executed in a

    // waterfall loop. This is essentially to reassociate the add of a constant

    // with the readfirstlane.

    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&

      ConstOffset > 0 &&

      ConstOffset < VecTy.getNumElements();


    // Move the base register. We'll re-insert the add later.

    if (ShouldMoveIndexIntoLoop)

      MI.getOperand(3).setReg(BaseIdxReg);


    if (InsRegs.empty()) {

      executeInWaterfallLoop(B, MI, {3});


      // Re-insert the constant offset add inside the waterfall loop.

      if (ShouldMoveIndexIntoLoop) {

        reinsertVectorIndexAdd(B, MI, 3, ConstOffset);

      }


      return;

    }


    assert(InsTy.getSizeInBits() == 64);


    const LLT S32 = LLT::scalar(32);

    LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);


    auto CastSrc = B.buildBitcast(Vec32, SrcReg);

    auto One = B.buildConstant(S32, 1);


    // Split the vector index into 32-bit pieces. Prepare to move all of the

    // new instructions into a waterfall loop if necessary.

    //

    // Don't put the bitcast or constant in the loop.

    MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());


    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).

    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);

    auto IdxHi = B.buildAdd(S32, IdxLo, One);


    auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);

    auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);


    const RegisterBank *DstBank =

      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    const RegisterBank *SrcBank =

      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

    const RegisterBank *InsSrcBank =

      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;


    MRI.setRegBank(InsReg, *InsSrcBank);

    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);

    MRI.setRegBank(InsLo.getReg(0), *DstBank);

    MRI.setRegBank(InsHi.getReg(0), *DstBank);

    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);

    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);

    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);


    SmallSet<Register, 4> OpsToWaterfall;

    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {

      B.setInsertPt(B.getMBB(), MI);

      B.buildBitcast(DstReg, InsHi);

      MI.eraseFromParent();

      return;

    }


    B.setInstr(*Span.begin());

    MI.eraseFromParent();


    // Figure out the point after the waterfall loop before mangling the control

    // flow.

    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),

                           OpsToWaterfall);


    // The insertion point is now right after the original instruction.

    //

    // Keep the bitcast to the original vector type out of the loop. Doing this

    // saved an extra phi we don't need inside the loop.

    B.buildBitcast(DstReg, InsHi);


    // Re-insert the constant offset add inside the waterfall loop.

    if (ShouldMoveIndexIntoLoop)

      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);


    return;

  }

  case AMDGPU::G_AMDGPU_BUFFER_LOAD:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:

  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:

  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:

  case AMDGPU::G_AMDGPU_BUFFER_STORE:

  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:

  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:

  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:

  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:

  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:

  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {

    applyDefaultMapping(OpdMapper);

    executeInWaterfallLoop(B, MI, {1, 4});

    return;

  }

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {

    applyDefaultMapping(OpdMapper);

    executeInWaterfallLoop(B, MI, {2, 5});

    return;

  }

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {

    applyDefaultMapping(OpdMapper);

    executeInWaterfallLoop(B, MI, {3, 6});

    return;

  }

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {

    applyMappingSBufferLoad(B, OpdMapper);

    return;

  }

  case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:

    constrainOpWithReadfirstlane(B, MI, 0);

    constrainOpWithReadfirstlane(B, MI, 2);

    return;

  case AMDGPU::G_INTRINSIC:

  case AMDGPU::G_INTRINSIC_CONVERGENT: {

    switch (cast<GIntrinsic>(MI).getIntrinsicID()) {

    case Intrinsic::amdgcn_readlane: {

      substituteSimpleCopyRegs(OpdMapper, 2);


      assert(OpdMapper.getVRegs(0).empty());

      assert(OpdMapper.getVRegs(3).empty());


      // Make sure the index is an SGPR. It doesn't make sense to run this in a

      // waterfall loop, so assume it's a uniform value.

      constrainOpWithReadfirstlane(B, MI, 3); // Index

      return;

    }

    case Intrinsic::amdgcn_writelane: {

      assert(OpdMapper.getVRegs(0).empty());

      assert(OpdMapper.getVRegs(2).empty());

      assert(OpdMapper.getVRegs(3).empty());


      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val

      constrainOpWithReadfirstlane(B, MI, 2); // Source value

      constrainOpWithReadfirstlane(B, MI, 3); // Index

      return;

    }

    case Intrinsic::amdgcn_interp_p1:

    case Intrinsic::amdgcn_interp_p2:

    case Intrinsic::amdgcn_interp_mov:

    case Intrinsic::amdgcn_interp_p1_f16:

    case Intrinsic::amdgcn_interp_p2_f16:

    case Intrinsic::amdgcn_lds_param_load: {

      applyDefaultMapping(OpdMapper);


      // Readlane for m0 value, which is always the last operand.

      // FIXME: Should this be a waterfall loop instead?

      constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index

      return;

    }

    case Intrinsic::amdgcn_interp_inreg_p10:

    case Intrinsic::amdgcn_interp_inreg_p2:

    case Intrinsic::amdgcn_interp_inreg_p10_f16:

    case Intrinsic::amdgcn_interp_inreg_p2_f16:

    case Intrinsic::amdgcn_interp_p10_rtz_f16:

    case Intrinsic::amdgcn_interp_p2_rtz_f16:

    case Intrinsic::amdgcn_permlane16_swap:

    case Intrinsic::amdgcn_permlane32_swap:

      applyDefaultMapping(OpdMapper);

      return;

    case Intrinsic::amdgcn_permlane16:

    case Intrinsic::amdgcn_permlanex16: {

      // Doing a waterfall loop over these wouldn't make any sense.

      substituteSimpleCopyRegs(OpdMapper, 2);

      substituteSimpleCopyRegs(OpdMapper, 3);

      constrainOpWithReadfirstlane(B, MI, 4);

      constrainOpWithReadfirstlane(B, MI, 5);

      return;

    }

    case Intrinsic::amdgcn_permlane_bcast:

    case Intrinsic::amdgcn_permlane_up:

    case Intrinsic::amdgcn_permlane_down:

    case Intrinsic::amdgcn_permlane_xor:

      // Doing a waterfall loop over these wouldn't make any sense.

      constrainOpWithReadfirstlane(B, MI, 3);

      constrainOpWithReadfirstlane(B, MI, 4);

      return;

    case Intrinsic::amdgcn_permlane_idx_gen: {

      constrainOpWithReadfirstlane(B, MI, 3);

      return;

    }

    case Intrinsic::amdgcn_sbfe:

      applyMappingBFE(B, OpdMapper, true);

      return;

    case Intrinsic::amdgcn_ubfe:

      applyMappingBFE(B, OpdMapper, false);

      return;

    case Intrinsic::amdgcn_inverse_ballot:

    case Intrinsic::amdgcn_s_bitreplicate:

    case Intrinsic::amdgcn_s_quadmask:

    case Intrinsic::amdgcn_s_wqm:

      applyDefaultMapping(OpdMapper);

      constrainOpWithReadfirstlane(B, MI, 2); // Mask

      return;

    case Intrinsic::amdgcn_ballot:

      // Use default handling and insert copy to vcc source.

      break;

    }

    break;

  }

  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:

  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:

  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:

  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:

  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {

    const AMDGPU::RsrcIntrinsic *RSrcIntrin =

        AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));

    assert(RSrcIntrin && RSrcIntrin->IsImage);

    // Non-images can have complications from operands that allow both SGPR

    // and VGPR. For now it's too complicated to figure out the final opcode

    // to derive the register bank from the MCInstrDesc.

    applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);

    return;

  }

  case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:

  case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:

  case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {

    bool IsDualOrBVH8 =

        MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||

        MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;

    unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier

    unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;

    applyDefaultMapping(OpdMapper);

    executeInWaterfallLoop(B, MI, {LastRegOpIdx});

    return;

  }

  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:

  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {

    auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();

    switch (IntrID) {

    case Intrinsic::amdgcn_ds_ordered_add:

    case Intrinsic::amdgcn_ds_ordered_swap: {

      // This is only allowed to execute with 1 lane, so readfirstlane is safe.

      assert(OpdMapper.getVRegs(0).empty());

      substituteSimpleCopyRegs(OpdMapper, 3);

      constrainOpWithReadfirstlane(B, MI, 2); // M0

      return;

    }

    case Intrinsic::amdgcn_ds_gws_init:

    case Intrinsic::amdgcn_ds_gws_barrier:

    case Intrinsic::amdgcn_ds_gws_sema_br: {

      // Only the first lane is executes, so readfirstlane is safe.

      substituteSimpleCopyRegs(OpdMapper, 1);

      constrainOpWithReadfirstlane(B, MI, 2); // M0

      return;

    }

    case Intrinsic::amdgcn_ds_gws_sema_v:

    case Intrinsic::amdgcn_ds_gws_sema_p:

    case Intrinsic::amdgcn_ds_gws_sema_release_all: {

      // Only the first lane is executes, so readfirstlane is safe.

      constrainOpWithReadfirstlane(B, MI, 1); // M0

      return;

    }

    case Intrinsic::amdgcn_ds_append:

    case Intrinsic::amdgcn_ds_consume: {

      constrainOpWithReadfirstlane(B, MI, 2); // M0

      return;

    }

    case Intrinsic::amdgcn_s_sendmsg:

    case Intrinsic::amdgcn_s_sendmsghalt: {

      // FIXME: Should this use a waterfall loop?

      constrainOpWithReadfirstlane(B, MI, 2); // M0

      return;

    }

    case Intrinsic::amdgcn_s_setreg: {

      constrainOpWithReadfirstlane(B, MI, 2);

      return;

    }

    case Intrinsic::amdgcn_s_ttracedata:

      constrainOpWithReadfirstlane(B, MI, 1); // M0

      return;

    case Intrinsic::amdgcn_raw_buffer_load_lds:

    case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {

      applyDefaultMapping(OpdMapper);

      constrainOpWithReadfirstlane(B, MI, 1); // rsrc

      constrainOpWithReadfirstlane(B, MI, 2); // M0

      constrainOpWithReadfirstlane(B, MI, 5); // soffset

      return;

    }

    case Intrinsic::amdgcn_struct_buffer_load_lds:

    case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {

      applyDefaultMapping(OpdMapper);

      constrainOpWithReadfirstlane(B, MI, 1); // rsrc

      constrainOpWithReadfirstlane(B, MI, 2); // M0

      constrainOpWithReadfirstlane(B, MI, 6); // soffset

      return;

    }

    case Intrinsic::amdgcn_load_to_lds:

    case Intrinsic::amdgcn_global_load_lds: {

      applyDefaultMapping(OpdMapper);

      constrainOpWithReadfirstlane(B, MI, 2);

      return;

    }

    case Intrinsic::amdgcn_lds_direct_load: {

      applyDefaultMapping(OpdMapper);

      // Readlane for m0 value, which is always the last operand.

      constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index

      return;

    }

    case Intrinsic::amdgcn_exp_row:

      applyDefaultMapping(OpdMapper);

      constrainOpWithReadfirstlane(B, MI, 8); // M0

      return;

    case Intrinsic::amdgcn_s_sleep_var:

      assert(OpdMapper.getVRegs(1).empty());

      constrainOpWithReadfirstlane(B, MI, 1);

      return;

    case Intrinsic::amdgcn_s_barrier_join:

      constrainOpWithReadfirstlane(B, MI, 1);

      return;

    case Intrinsic::amdgcn_s_barrier_init:

    case Intrinsic::amdgcn_s_barrier_signal_var:

      constrainOpWithReadfirstlane(B, MI, 1);

      constrainOpWithReadfirstlane(B, MI, 2);

      return;

    case Intrinsic::amdgcn_s_get_barrier_state:

    case Intrinsic::amdgcn_s_get_named_barrier_state: {

      constrainOpWithReadfirstlane(B, MI, 2);

      return;

    }

    case Intrinsic::amdgcn_s_prefetch_data: {

      Register PtrReg = MI.getOperand(1).getReg();

      unsigned AS = MRI.getType(PtrReg).getAddressSpace();

      if (AMDGPU::isFlatGlobalAddrSpace(AS)) {

        constrainOpWithReadfirstlane(B, MI, 1);

        constrainOpWithReadfirstlane(B, MI, 2);

      } else

        MI.eraseFromParent();

      return;

    }

    case Intrinsic::amdgcn_tensor_load_to_lds:

    case Intrinsic::amdgcn_tensor_store_from_lds: {

      constrainOpWithReadfirstlane(B, MI, 1);

      constrainOpWithReadfirstlane(B, MI, 2);

      constrainOpWithReadfirstlane(B, MI, 3);

      constrainOpWithReadfirstlane(B, MI, 4);

      return;

    }

    case Intrinsic::amdgcn_tensor_load_to_lds_d2:

    case Intrinsic::amdgcn_tensor_store_from_lds_d2: {

      constrainOpWithReadfirstlane(B, MI, 1);

      constrainOpWithReadfirstlane(B, MI, 2);

      return;

    }

    default: {

      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =

              AMDGPU::lookupRsrcIntrinsic(IntrID)) {

        // Non-images can have complications from operands that allow both SGPR

        // and VGPR. For now it's too complicated to figure out the final opcode

        // to derive the register bank from the MCInstrDesc.

        if (RSrcIntrin->IsImage) {

          applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);

          return;

        }

      }


      break;

    }

    }

    break;

  }

  case AMDGPU::G_SI_CALL: {

    // Use a set to avoid extra readfirstlanes in the case where multiple

    // operands are the same register.

    SmallSet<Register, 4> SGPROperandRegs;


    if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))

      break;


    // Move all copies to physical SGPRs that are used by the call instruction

    // into the loop block. Start searching for these copies until the

    // ADJCALLSTACKUP.

    unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;

    unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;


    // Move all non-copies before the copies, so that a complete range can be

    // moved into the waterfall loop.

    SmallVector<MachineInstr *, 4> NonCopyInstrs;

    // Count of NonCopyInstrs found until the current LastCopy.

    unsigned NonCopyInstrsLen = 0;

    MachineBasicBlock::iterator Start(&MI);

    MachineBasicBlock::iterator LastCopy = Start;

    MachineBasicBlock *MBB = MI.getParent();

    const SIMachineFunctionInfo *Info =

        MBB->getParent()->getInfo<SIMachineFunctionInfo>();

    while (Start->getOpcode() != FrameSetupOpcode) {

      --Start;

      bool IsCopy = false;

      if (Start->getOpcode() == AMDGPU::COPY) {

        auto &Dst = Start->getOperand(0);

        if (Dst.isReg()) {

          Register Reg = Dst.getReg();

          if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {

            IsCopy = true;

          } else {

            // Also move the copy from the scratch rsrc descriptor into the loop

            // to allow it to be optimized away.

            auto &Src = Start->getOperand(1);

            if (Src.isReg()) {

              Reg = Src.getReg();

              IsCopy = Info->getScratchRSrcReg() == Reg;

            }

          }

        }

      }


      if (IsCopy) {

        LastCopy = Start;

        NonCopyInstrsLen = NonCopyInstrs.size();

      } else {

        NonCopyInstrs.push_back(&*Start);

      }

    }

    NonCopyInstrs.resize(NonCopyInstrsLen);


    for (auto *NonCopy : reverse(NonCopyInstrs)) {

      MBB->splice(LastCopy, MBB, NonCopy->getIterator());

    }

    Start = LastCopy;


    // Do the same for copies after the loop

    NonCopyInstrs.clear();

    NonCopyInstrsLen = 0;

    MachineBasicBlock::iterator End(&MI);

    LastCopy = End;

    while (End->getOpcode() != FrameDestroyOpcode) {

      ++End;

      bool IsCopy = false;

      if (End->getOpcode() == AMDGPU::COPY) {

        auto &Src = End->getOperand(1);

        if (Src.isReg()) {

          Register Reg = Src.getReg();

          IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);

        }

      }


      if (IsCopy) {

        LastCopy = End;

        NonCopyInstrsLen = NonCopyInstrs.size();

      } else {

        NonCopyInstrs.push_back(&*End);

      }

    }

    NonCopyInstrs.resize(NonCopyInstrsLen);


    End = LastCopy;

    ++LastCopy;

    for (auto *NonCopy : reverse(NonCopyInstrs)) {

      MBB->splice(LastCopy, MBB, NonCopy->getIterator());

    }


    ++End;

    B.setInsertPt(B.getMBB(), Start);

    executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);

    break;

  }

  case AMDGPU::G_LOAD:

  case AMDGPU::G_ZEXTLOAD:

  case AMDGPU::G_SEXTLOAD: {

    if (applyMappingLoad(B, OpdMapper, MI))

      return;

    break;

  }

  case AMDGPU::G_DYN_STACKALLOC:

    applyMappingDynStackAlloc(B, OpdMapper, MI);

    return;

  case AMDGPU::G_STACKRESTORE: {

    applyDefaultMapping(OpdMapper);

    constrainOpWithReadfirstlane(B, MI, 0);

    return;

  }

  case AMDGPU::G_SBFX:

    applyMappingBFE(B, OpdMapper, /*Signed*/ true);

    return;

  case AMDGPU::G_UBFX:

    applyMappingBFE(B, OpdMapper, /*Signed*/ false);

    return;

  case AMDGPU::G_AMDGPU_MAD_U64_U32:

  case AMDGPU::G_AMDGPU_MAD_I64_I32:

    applyMappingMAD_64_32(B, OpdMapper);

    return;

  case AMDGPU::G_PREFETCH: {

    if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) {

      MI.eraseFromParent();

      return;

    }

    Register PtrReg = MI.getOperand(0).getReg();

    unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);

    if (PtrBank == AMDGPU::VGPRRegBankID &&

        (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) {

      // Cannot do I$ prefetch with divergent pointer.

      MI.eraseFromParent();

      return;

    }

    unsigned AS = MRI.getType(PtrReg).getAddressSpace();

    if ((!AMDGPU::isFlatGlobalAddrSpace(AS) &&

         AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||

        (!Subtarget.hasSafeSmemPrefetch() &&

         (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||

          !MI.getOperand(3).getImm() /* I$ prefetch */))) {

      MI.eraseFromParent();

      return;

    }

    applyDefaultMapping(OpdMapper);

    return;

  }

  default:

    break;

  }


  return applyDefaultMapping(OpdMapper);

}


// vgpr, sgpr -> vgpr

// vgpr, agpr -> vgpr

// agpr, agpr -> agpr

// agpr, sgpr -> vgpr

static unsigned regBankUnion(unsigned RB0, unsigned RB1) {

  if (RB0 == AMDGPU::InvalidRegBankID)

    return RB1;

  if (RB1 == AMDGPU::InvalidRegBankID)

    return RB0;


  if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)

    return AMDGPU::SGPRRegBankID;


  if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)

    return AMDGPU::AGPRRegBankID;


  return AMDGPU::VGPRRegBankID;

}


static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {

  if (RB0 == AMDGPU::InvalidRegBankID)

    return RB1;

  if (RB1 == AMDGPU::InvalidRegBankID)

    return RB0;


  // vcc, vcc -> vcc

  // vcc, sgpr -> vcc

  // vcc, vgpr -> vcc

  if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)

    return AMDGPU::VCCRegBankID;


  // vcc, vgpr -> vgpr

  return regBankUnion(RB0, RB1);

}


unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,

                                                const MachineInstr &MI) const {

  unsigned RegBank = AMDGPU::InvalidRegBankID;


  for (const MachineOperand &MO : MI.operands()) {

    if (!MO.isReg())

      continue;

    Register Reg = MO.getReg();

    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {

      RegBank = regBankUnion(RegBank, Bank->getID());

      if (RegBank == AMDGPU::VGPRRegBankID)

        break;

    }

  }


  return RegBank;

}


bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &MO : MI.operands()) {

    if (!MO.isReg())

      continue;

    Register Reg = MO.getReg();

    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {

      if (Bank->getID() != AMDGPU::SGPRRegBankID)

        return false;

    }

  }

  return true;

}


const RegisterBankInfo::InstructionMapping &

AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());


  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {

    const MachineOperand &SrcOp = MI.getOperand(i);

    if (!SrcOp.isReg())

      continue;


    unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);

    OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);

  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),

                               MI.getNumOperands());

}


const RegisterBankInfo::InstructionMapping &

AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());


  // Even though we technically could use SGPRs, this would require knowledge of

  // the constant bus restriction. Force all sources to VGPR (except for VCC).

  //

  // TODO: Unary ops are trivially OK, so accept SGPRs?

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {

    const MachineOperand &Src = MI.getOperand(i);

    if (!Src.isReg())

      continue;


    unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);

    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;

    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);

  }


  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),

                               MI.getNumOperands());

}


const RegisterBankInfo::InstructionMapping &

AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());


  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {

    const MachineOperand &Op = MI.getOperand(I);

    if (!Op.isReg())

      continue;


    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);

    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

  }


  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),

                               MI.getNumOperands());

}


const RegisterBankInfo::InstructionMapping &

AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,

                                        const MachineInstr &MI,

                                        int RsrcIdx) const {

  // The reported argument index is relative to the IR intrinsic call arguments,

  // so we need to shift by the number of defs and the intrinsic ID.

  RsrcIdx += MI.getNumExplicitDefs() + 1;


  const int NumOps = MI.getNumOperands();

  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);


  // TODO: Should packed/unpacked D16 difference be reported here as part of

  // the value mapping?

  for (int I = 0; I != NumOps; ++I) {

    if (!MI.getOperand(I).isReg())

      continue;


    Register OpReg = MI.getOperand(I).getReg();

    // We replace some dead address operands with $noreg

    if (!OpReg)

      continue;


    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);


    // FIXME: Probably need a new intrinsic register bank searchable table to

    // handle arbitrary intrinsics easily.

    //

    // If this has a sampler, it immediately follows rsrc.

    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;


    if (MustBeSGPR) {

      // If this must be an SGPR, so we must report whatever it is as legal.

      unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);

      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);

    } else {

      // Some operands must be VGPR, and these are easy to copy to.

      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

    }

  }


  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);

}


/// Return the mapping for a pointer argument.

const RegisterBankInfo::ValueMapping *

AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,

                                              Register PtrReg) const {

  LLT PtrTy = MRI.getType(PtrReg);

  unsigned Size = PtrTy.getSizeInBits();

  if (Subtarget.useFlatForGlobal() ||

      !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))

    return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);


  // If we're using MUBUF instructions for global memory, an SGPR base register

  // is possible. Otherwise this needs to be a VGPR.

  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);

  return AMDGPU::getValueMapping(PtrBank->getID(), Size);

}


const RegisterBankInfo::InstructionMapping &

AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {


  const MachineFunction &MF = *MI.getParent()->getParent();

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  SmallVector<const ValueMapping*, 2> OpdsMapping(2);

  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

  Register PtrReg = MI.getOperand(1).getReg();

  LLT PtrTy = MRI.getType(PtrReg);

  unsigned AS = PtrTy.getAddressSpace();

  unsigned PtrSize = PtrTy.getSizeInBits();


  const ValueMapping *ValMapping;

  const ValueMapping *PtrMapping;


  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);


  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {

    if (isScalarLoadLegal(MI)) {

      // We have a uniform instruction so we want to use an SMRD load

      ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);

      PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);

    } else {

      ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);


      // If we're using MUBUF instructions for global memory, an SGPR base

      // register is possible. Otherwise this needs to be a VGPR.

      unsigned PtrBankID = Subtarget.useFlatForGlobal() ?

        AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;


      PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);

    }

  } else {

    ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);

  }


  OpdsMapping[0] = ValMapping;

  OpdsMapping[1] = PtrMapping;

  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(

      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());

  return Mapping;


  // FIXME: Do we want to add a mapping for FLAT load, or should we just

  // handle that during instruction selection?

}


unsigned

AMDGPURegisterBankInfo::getRegBankID(Register Reg,

                                     const MachineRegisterInfo &MRI,

                                     unsigned Default) const {

  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

  return Bank ? Bank->getID() : Default;

}


const RegisterBankInfo::ValueMapping *

AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,

                                         const MachineRegisterInfo &MRI,

                                         const TargetRegisterInfo &TRI) const {

  // Lie and claim anything is legal, even though this needs to be an SGPR

  // applyMapping will have to deal with it as a waterfall loop.

  unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);

  unsigned Size = getSizeInBits(Reg, MRI, TRI);

  return AMDGPU::getValueMapping(Bank, Size);

}


const RegisterBankInfo::ValueMapping *

AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,

                                         const MachineRegisterInfo &MRI,

                                         const TargetRegisterInfo &TRI) const {

  unsigned Size = getSizeInBits(Reg, MRI, TRI);

  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

}


const RegisterBankInfo::ValueMapping *

AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,

                                         const MachineRegisterInfo &MRI,

                                         const TargetRegisterInfo &TRI) const {

  unsigned Size = getSizeInBits(Reg, MRI, TRI);

  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);

}


///

/// This function must return a legal mapping, because

/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called

/// in RegBankSelect::Mode::Fast.  Any mapping that would cause a

/// VGPR to SGPR generated is illegal.

///

// Operands that must be SGPRs must accept potentially divergent VGPRs as

// legal. These will be dealt with in applyMappingImpl.

//

const RegisterBankInfo::InstructionMapping &

AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();

  const MachineRegisterInfo &MRI = MF.getRegInfo();


  if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {

    Register DstReg = MI.getOperand(0).getReg();

    Register SrcReg = MI.getOperand(1).getReg();


    // The default logic bothers to analyze impossible alternative mappings. We

    // want the most straightforward mapping, so just directly handle this.

    const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);

    const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

    assert(SrcBank && "src bank should have been assigned already");


    // For COPY between a physical reg and an s1, there is no type associated so

    // we need to take the virtual register's type as a hint on how to interpret

    // s1 values.

    if (!SrcReg.isVirtual() && !DstBank &&

        MRI.getType(DstReg) == LLT::scalar(1))

      DstBank = &AMDGPU::VCCRegBank;

    else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1))

      DstBank = &AMDGPU::VCCRegBank;


    if (!DstBank)

      DstBank = SrcBank;


    unsigned Size = getSizeInBits(DstReg, MRI, *TRI);

    if (MI.getOpcode() != AMDGPU::G_FREEZE &&

        cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))

      return getInvalidInstructionMapping();


    const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);

    unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;

    SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);

    OpdsMapping[0] = &ValMap;

    if (MI.getOpcode() == AMDGPU::G_FREEZE)

      OpdsMapping[1] = &ValMap;


    return getInstructionMapping(

        1, /*Cost*/ 1,

        /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);

  }


  if (MI.isRegSequence()) {

    // If any input is a VGPR, the result must be a VGPR. The default handling

    // assumes any copy between banks is legal.

    unsigned BankID = AMDGPU::SGPRRegBankID;


    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {

      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);

      // It doesn't make sense to use vcc or scc banks here, so just ignore

      // them.

      if (OpBank != AMDGPU::SGPRRegBankID) {

        BankID = AMDGPU::VGPRRegBankID;

        break;

      }

    }

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);


    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));

    return getInstructionMapping(

        1, /*Cost*/ 1,

        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);

  }


  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies

  // properly.

  //

  // TODO: There are additional exec masking dependencies to analyze.

  if (auto *PHI = dyn_cast<GPhi>(&MI)) {

    unsigned ResultBank = AMDGPU::InvalidRegBankID;

    Register DstReg = PHI->getReg(0);


    // Sometimes the result may have already been assigned a bank.

    if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))

      ResultBank = DstBank->getID();


    for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {

      Register Reg = PHI->getIncomingValue(I);

      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);


      // FIXME: Assuming VGPR for any undetermined inputs.

      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {

        ResultBank = AMDGPU::VGPRRegBankID;

        break;

      }


      // FIXME: Need to promote SGPR case to s32

      unsigned OpBank = Bank->getID();

      ResultBank = regBankBoolUnion(ResultBank, OpBank);

    }


    assert(ResultBank != AMDGPU::InvalidRegBankID);


    unsigned Size = MRI.getType(DstReg).getSizeInBits();


    const ValueMapping &ValMap =

        getValueMapping(0, Size, getRegBank(ResultBank));

    return getInstructionMapping(

        1, /*Cost*/ 1,

        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);

  }


  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);

  if (Mapping.isValid())

    return Mapping;


  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());


  switch (MI.getOpcode()) {

  default:

    return getInvalidInstructionMapping();


  case AMDGPU::G_AND:

  case AMDGPU::G_OR:

  case AMDGPU::G_XOR:

  case AMDGPU::G_MUL: {

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    if (Size == 1) {

      const RegisterBank *DstBank

        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);


      unsigned TargetBankID = AMDGPU::InvalidRegBankID;

      unsigned BankLHS = AMDGPU::InvalidRegBankID;

      unsigned BankRHS = AMDGPU::InvalidRegBankID;

      if (DstBank) {

        TargetBankID = DstBank->getID();

        if (DstBank == &AMDGPU::VCCRegBank) {

          TargetBankID = AMDGPU::VCCRegBankID;

          BankLHS = AMDGPU::VCCRegBankID;

          BankRHS = AMDGPU::VCCRegBankID;

        } else {

          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,

                                 AMDGPU::SGPRRegBankID);

          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,

                                 AMDGPU::SGPRRegBankID);

        }

      } else {

        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,

                               AMDGPU::VCCRegBankID);

        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,

                               AMDGPU::VCCRegBankID);


        // Both inputs should be true booleans to produce a boolean result.

        if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {

          TargetBankID = AMDGPU::VGPRRegBankID;

        } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {

          TargetBankID = AMDGPU::VCCRegBankID;

          BankLHS = AMDGPU::VCCRegBankID;

          BankRHS = AMDGPU::VCCRegBankID;

        } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {

          TargetBankID = AMDGPU::SGPRRegBankID;

        }

      }


      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);

      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);

      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);

      break;

    }


    if (Size == 64) {


      if (isSALUMapping(MI)) {

        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);

        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];

      } else {

        if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64())

          OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

        else

          OpdsMapping[0] =

              getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);

        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);

        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);


        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);

        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);

      }


      break;

    }


    [[fallthrough]];

  }

  case AMDGPU::G_PTR_ADD:

  case AMDGPU::G_PTRMASK:

  case AMDGPU::G_ADD:

  case AMDGPU::G_SUB:

  case AMDGPU::G_SHL:

  case AMDGPU::G_LSHR:

  case AMDGPU::G_ASHR:

  case AMDGPU::G_UADDO:

  case AMDGPU::G_USUBO:

  case AMDGPU::G_UADDE:

  case AMDGPU::G_SADDE:

  case AMDGPU::G_USUBE:

  case AMDGPU::G_SSUBE:

  case AMDGPU::G_ABS:

  case AMDGPU::G_SHUFFLE_VECTOR:

  case AMDGPU::G_SBFX:

  case AMDGPU::G_UBFX:

  case AMDGPU::G_AMDGPU_S_MUL_I64_I32:

  case AMDGPU::G_AMDGPU_S_MUL_U64_U32:

    if (isSALUMapping(MI))

      return getDefaultMappingSOP(MI);

    return getDefaultMappingVOP(MI);

  case AMDGPU::G_SMIN:

  case AMDGPU::G_SMAX:

  case AMDGPU::G_UMIN:

  case AMDGPU::G_UMAX:

    if (isSALUMapping(MI)) {

      // There are no scalar 64-bit min and max, use vector instruction instead.

      if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 64 &&

          Subtarget.hasIntMinMax64())

        return getDefaultMappingVOP(MI);

      return getDefaultMappingSOP(MI);

    }

    return getDefaultMappingVOP(MI);

  case AMDGPU::G_FADD:

  case AMDGPU::G_FSUB:

  case AMDGPU::G_FMUL:

  case AMDGPU::G_FMA:

  case AMDGPU::G_FFLOOR:

  case AMDGPU::G_FCEIL:

  case AMDGPU::G_INTRINSIC_ROUNDEVEN:

  case AMDGPU::G_FMINNUM:

  case AMDGPU::G_FMAXNUM:

  case AMDGPU::G_FMINIMUM:

  case AMDGPU::G_FMAXIMUM:

  case AMDGPU::G_FMINIMUMNUM:

  case AMDGPU::G_FMAXIMUMNUM:

  case AMDGPU::G_INTRINSIC_TRUNC:

  case AMDGPU::G_STRICT_FADD:

  case AMDGPU::G_STRICT_FSUB:

  case AMDGPU::G_STRICT_FMUL:

  case AMDGPU::G_STRICT_FMA: {

    LLT Ty = MRI.getType(MI.getOperand(0).getReg());

    unsigned Size = Ty.getSizeInBits();

    if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&

        (Size == 32 || Size == 16) && isSALUMapping(MI))

      return getDefaultMappingSOP(MI);

    return getDefaultMappingVOP(MI);

  }

  case AMDGPU::G_FPTOSI:

  case AMDGPU::G_FPTOUI:

  case AMDGPU::G_SITOFP:

  case AMDGPU::G_UITOFP: {

    unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&

        isSALUMapping(MI))

      return getDefaultMappingSOP(MI);

    return getDefaultMappingVOP(MI);

  }

  case AMDGPU::G_FPTRUNC:

  case AMDGPU::G_FPEXT: {

    unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&

        isSALUMapping(MI))

      return getDefaultMappingSOP(MI);

    return getDefaultMappingVOP(MI);

  }

  case AMDGPU::G_FSQRT:

  case AMDGPU::G_FEXP2:

  case AMDGPU::G_FLOG2: {

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&

        isSALUMapping(MI))

      return getDefaultMappingSOP(MI);

    return getDefaultMappingVOP(MI);

  }

  case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU

  case AMDGPU::G_SSUBSAT:

  case AMDGPU::G_UADDSAT:

  case AMDGPU::G_USUBSAT:

  case AMDGPU::G_FMAD:

  case AMDGPU::G_FLDEXP:

  case AMDGPU::G_FMINNUM_IEEE:

  case AMDGPU::G_FMAXNUM_IEEE:

  case AMDGPU::G_FCANONICALIZE:

  case AMDGPU::G_STRICT_FLDEXP:

  case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?

  case AMDGPU::G_FSHR: // TODO: Expand for scalar

  case AMDGPU::G_AMDGPU_FMIN_LEGACY:

  case AMDGPU::G_AMDGPU_FMAX_LEGACY:

  case AMDGPU::G_AMDGPU_RCP_IFLAG:

  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:

  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:

  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:

  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:

  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:

  case AMDGPU::G_AMDGPU_SMED3:

  case AMDGPU::G_AMDGPU_FMED3:

    return getDefaultMappingVOP(MI);

  case AMDGPU::G_UMULH:

  case AMDGPU::G_SMULH: {

    if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))

      return getDefaultMappingSOP(MI);

    return getDefaultMappingVOP(MI);

  }

  case AMDGPU::G_AMDGPU_MAD_U64_U32:

  case AMDGPU::G_AMDGPU_MAD_I64_I32: {

    // Three possible mappings:

    //

    //  - Default SOP

    //  - Default VOP

    //  - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.

    //

    // This allows instruction selection to keep the multiplication part of the

    // instruction on the SALU.

    bool AllSalu = true;

    bool MulSalu = true;

    for (unsigned i = 0; i < 5; ++i) {

      Register Reg = MI.getOperand(i).getReg();

      if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {

        if (Bank->getID() != AMDGPU::SGPRRegBankID) {

          AllSalu = false;

          if (i == 2 || i == 3) {

            MulSalu = false;

            break;

          }

        }

      }

    }


    if (AllSalu)

      return getDefaultMappingSOP(MI);


    // If the multiply-add is full-rate in VALU, use that even if the

    // multiplication part is scalar. Accumulating separately on the VALU would

    // take two instructions.

    if (!MulSalu || Subtarget.hasFullRate64Ops())

      return getDefaultMappingVOP(MI);


    // Keep the multiplication on the SALU, then accumulate on the VALU.

    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);

    OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);

    OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);

    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);

    OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);

    break;

  }

  case AMDGPU::G_IMPLICIT_DEF: {

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);

    break;

  }

  case AMDGPU::G_FCONSTANT:

  case AMDGPU::G_CONSTANT:

  case AMDGPU::G_GLOBAL_VALUE:

  case AMDGPU::G_FRAME_INDEX:

  case AMDGPU::G_BLOCK_ADDR:

  case AMDGPU::G_READSTEADYCOUNTER:

  case AMDGPU::G_READCYCLECOUNTER: {

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);

    break;

  }

  case AMDGPU::G_DYN_STACKALLOC: {

    // Result is always uniform, and a wave reduction is needed for the source.

    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);

    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);

    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);

    break;

  }

  case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {

    // This case is weird because we expect a physical register in the source,

    // but need to set a bank anyway.

    //

    // TODO: We could select the result to SGPR or VGPR

    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);

    OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);

    break;

  }

  case AMDGPU::G_INSERT: {

    unsigned BankID = getMappingType(MRI, MI);

    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);

    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);

    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);

    OpdsMapping[3] = nullptr;

    break;

  }

  case AMDGPU::G_EXTRACT: {

    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);

    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);

    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);

    OpdsMapping[2] = nullptr;

    break;

  }

  case AMDGPU::G_BUILD_VECTOR:

  case AMDGPU::G_BUILD_VECTOR_TRUNC: {

    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

    if (DstTy == LLT::fixed_vector(2, 16)) {

      unsigned DstSize = DstTy.getSizeInBits();

      unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

      unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);

      unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);

      unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);


      OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);

      OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);

      OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);

      break;

    }


    [[fallthrough]];

  }

  case AMDGPU::G_MERGE_VALUES:

  case AMDGPU::G_CONCAT_VECTORS: {

    unsigned Bank = getMappingType(MRI, MI);

    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();


    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);

    // Op1 and Dst should use the same register bank.

    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)

      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);

    break;

  }

  case AMDGPU::G_BITREVERSE:

  case AMDGPU::G_BITCAST:

  case AMDGPU::G_INTTOPTR:

  case AMDGPU::G_PTRTOINT:

  case AMDGPU::G_FABS:

  case AMDGPU::G_FNEG: {

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);

    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);

    break;

  }

  case AMDGPU::G_AMDGPU_FFBH_U32:

  case AMDGPU::G_AMDGPU_FFBL_B32:

  case AMDGPU::G_CTLZ_ZERO_UNDEF:

  case AMDGPU::G_CTTZ_ZERO_UNDEF: {

    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);

    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);

    break;

  }

  case AMDGPU::G_CTPOP: {

    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);


    // This should really be getValueMappingSGPR64Only, but allowing the generic

    // code to handle the register split just makes using LegalizerHelper more

    // difficult.

    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);

    break;

  }

  case AMDGPU::G_TRUNC: {

    Register Dst = MI.getOperand(0).getReg();

    Register Src = MI.getOperand(1).getReg();

    unsigned Bank = getRegBankID(Src, MRI);

    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);

    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);

    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);

    break;

  }

  case AMDGPU::G_ZEXT:

  case AMDGPU::G_SEXT:

  case AMDGPU::G_ANYEXT:

  case AMDGPU::G_SEXT_INREG: {

    Register Dst = MI.getOperand(0).getReg();

    Register Src = MI.getOperand(1).getReg();

    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);

    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);


    unsigned DstBank;

    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);

    assert(SrcBank);

    switch (SrcBank->getID()) {

    case AMDGPU::SGPRRegBankID:

      DstBank = AMDGPU::SGPRRegBankID;

      break;

    default:

      DstBank = AMDGPU::VGPRRegBankID;

      break;

    }


    // Scalar extend can use 64-bit BFE, but VGPRs require extending to

    // 32-bits, and then to 64.

    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);

    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),

                                                       SrcSize);

    break;

  }

  case AMDGPU::G_IS_FPCLASS: {

    Register SrcReg = MI.getOperand(1).getReg();

    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();

    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);

    OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);

    break;

  }

  case AMDGPU::G_STORE: {

    assert(MI.getOperand(0).isReg());

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();


    // FIXME: We need to specify a different reg bank once scalar stores are

    // supported.

    const ValueMapping *ValMapping =

        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

    OpdsMapping[0] = ValMapping;

    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());

    break;

  }

  case AMDGPU::G_ICMP:

  case AMDGPU::G_FCMP: {

    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();


    // See if the result register has already been constrained to vcc, which may

    // happen due to control flow intrinsic lowering.

    unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,

                                    AMDGPU::SGPRRegBankID);

    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);

    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);


    auto canUseSCCICMP = [&]() {

      auto Pred =

          static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

      return Size == 32 ||

             (Size == 64 &&

              (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&

              Subtarget.hasScalarCompareEq64());

    };

    auto canUseSCCFCMP = [&]() {

      return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);

    };


    bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;

    bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&

                     Op2Bank == AMDGPU::SGPRRegBankID &&

                     Op3Bank == AMDGPU::SGPRRegBankID &&

                     (isICMP ? canUseSCCICMP() : canUseSCCFCMP());


    DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;

    unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;


    // TODO: Use 32-bit for scalar output size.

    // SCC results will need to be copied to a 32-bit SGPR virtual register.

    const unsigned ResultSize = 1;


    OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);

    OpdsMapping[1] = nullptr; // Predicate Operand.

    OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);

    OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);

    break;

  }

  case AMDGPU::G_EXTRACT_VECTOR_ELT: {

    // VGPR index can be used for waterfall when indexing a SGPR vector.

    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);

    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);

    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);


    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);

    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);


    // The index can be either if the source vector is VGPR.

    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);

    break;

  }

  case AMDGPU::G_INSERT_VECTOR_ELT: {

    unsigned OutputBankID = isSALUMapping(MI) ?

      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;


    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();

    unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);

    unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);


    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);

    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);


    // This is a weird case, because we need to break down the mapping based on

    // the register bank of a different operand.

    if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {

      OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,

                                                      InsertSize);

    } else {

      assert(InsertSize == 32 || InsertSize == 64);

      OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);

    }


    // The index can be either if the source vector is VGPR.

    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);

    break;

  }

  case AMDGPU::G_UNMERGE_VALUES: {

    unsigned Bank = getMappingType(MRI, MI);


    // Op1 and Dst should use the same register bank.

    // FIXME: Shouldn't this be the default? Why do we need to handle this?

    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {

      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);

      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);

    }

    break;

  }

  case AMDGPU::G_AMDGPU_BUFFER_LOAD:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:

  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:

  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:

  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:

  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:

  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:

  case AMDGPU::G_AMDGPU_BUFFER_STORE:

  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:

  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:

  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:

  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {

    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);


    // rsrc

    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);


    // vindex

    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);


    // voffset

    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);


    // soffset

    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);


    // Any remaining operands are immediates and were correctly null

    // initialized.

    break;

  }

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {

    // vdata_out

    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);


    // vdata_in

    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);


    // rsrc

    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);


    // vindex

    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);


    // voffset

    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);


    // soffset

    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);


    // Any remaining operands are immediates and were correctly null

    // initialized.

    break;

  }

  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {

    // vdata_out

    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);


    // vdata_in

    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);


    // cmp

    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);


    // rsrc

    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);


    // vindex

    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);


    // voffset

    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);


    // soffset

    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);


    // Any remaining operands are immediates and were correctly null

    // initialized.

    break;

  }

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:

  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {

    // Lie and claim everything is legal, even though some need to be

    // SGPRs. applyMapping will have to deal with it as a waterfall loop.

    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);


    // We need to convert this to a MUBUF if either the resource of offset is

    // VGPR.

    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();

    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();

    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);


    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);

    break;

  }

  case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:

    OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    break;

  case AMDGPU::G_INTRINSIC:

  case AMDGPU::G_INTRINSIC_CONVERGENT: {

    switch (cast<GIntrinsic>(MI).getIntrinsicID()) {

    default:

      return getInvalidInstructionMapping();

    case Intrinsic::amdgcn_div_fmas:

    case Intrinsic::amdgcn_div_fixup:

    case Intrinsic::amdgcn_trig_preop:

    case Intrinsic::amdgcn_sin:

    case Intrinsic::amdgcn_cos:

    case Intrinsic::amdgcn_log_clamp:

    case Intrinsic::amdgcn_rcp_legacy:

    case Intrinsic::amdgcn_rsq_legacy:

    case Intrinsic::amdgcn_rsq_clamp:

    case Intrinsic::amdgcn_tanh:

    case Intrinsic::amdgcn_fmul_legacy:

    case Intrinsic::amdgcn_fma_legacy:

    case Intrinsic::amdgcn_frexp_mant:

    case Intrinsic::amdgcn_frexp_exp:

    case Intrinsic::amdgcn_fract:

    case Intrinsic::amdgcn_cvt_pknorm_i16:

    case Intrinsic::amdgcn_cvt_pknorm_u16:

    case Intrinsic::amdgcn_cvt_pk_i16:

    case Intrinsic::amdgcn_cvt_pk_u16:

    case Intrinsic::amdgcn_cvt_sr_pk_f16_f32:

    case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32:

    case Intrinsic::amdgcn_cvt_pk_f16_fp8:

    case Intrinsic::amdgcn_cvt_pk_f16_bf8:

    case Intrinsic::amdgcn_cvt_pk_fp8_f16:

    case Intrinsic::amdgcn_cvt_pk_bf8_f16:

    case Intrinsic::amdgcn_cvt_sr_fp8_f16:

    case Intrinsic::amdgcn_cvt_sr_bf8_f16:

    case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp8:

    case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp8:

    case Intrinsic::amdgcn_cvt_scale_pk8_f16_bf8:

    case Intrinsic::amdgcn_cvt_scale_pk8_bf16_bf8:

    case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp4:

    case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp4:

    case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:

    case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:

    case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:

    case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6:

    case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6:

    case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6:

    case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6:

    case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6:

    case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6:

    case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16:

    case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16:

    case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32:

    case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32:

    case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32:

    case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16:

    case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32:

    case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32:

    case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16:

    case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16:

    case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:

    case Intrinsic::amdgcn_sat_pk4_i4_i8:

    case Intrinsic::amdgcn_sat_pk4_u4_u8:

    case Intrinsic::amdgcn_fmed3:

    case Intrinsic::amdgcn_cubeid:

    case Intrinsic::amdgcn_cubema:

    case Intrinsic::amdgcn_cubesc:

    case Intrinsic::amdgcn_cubetc:

    case Intrinsic::amdgcn_sffbh:

    case Intrinsic::amdgcn_fmad_ftz:

    case Intrinsic::amdgcn_mbcnt_lo:

    case Intrinsic::amdgcn_mbcnt_hi:

    case Intrinsic::amdgcn_mul_u24:

    case Intrinsic::amdgcn_mul_i24:

    case Intrinsic::amdgcn_mulhi_u24:

    case Intrinsic::amdgcn_mulhi_i24:

    case Intrinsic::amdgcn_lerp:

    case Intrinsic::amdgcn_sad_u8:

    case Intrinsic::amdgcn_msad_u8:

    case Intrinsic::amdgcn_sad_hi_u8:

    case Intrinsic::amdgcn_sad_u16:

    case Intrinsic::amdgcn_qsad_pk_u16_u8:

    case Intrinsic::amdgcn_mqsad_pk_u16_u8:

    case Intrinsic::amdgcn_mqsad_u32_u8:

    case Intrinsic::amdgcn_cvt_pk_u8_f32:

    case Intrinsic::amdgcn_alignbyte:

    case Intrinsic::amdgcn_perm:

    case Intrinsic::amdgcn_prng_b32:

    case Intrinsic::amdgcn_fdot2:

    case Intrinsic::amdgcn_sdot2:

    case Intrinsic::amdgcn_udot2:

    case Intrinsic::amdgcn_sdot4:

    case Intrinsic::amdgcn_udot4:

    case Intrinsic::amdgcn_sdot8:

    case Intrinsic::amdgcn_udot8:

    case Intrinsic::amdgcn_fdot2_bf16_bf16:

    case Intrinsic::amdgcn_fdot2_f16_f16:

    case Intrinsic::amdgcn_fdot2_f32_bf16:

    case Intrinsic::amdgcn_fdot2c_f32_bf16:

    case Intrinsic::amdgcn_sudot4:

    case Intrinsic::amdgcn_sudot8:

    case Intrinsic::amdgcn_dot4_f32_fp8_bf8:

    case Intrinsic::amdgcn_dot4_f32_bf8_fp8:

    case Intrinsic::amdgcn_dot4_f32_fp8_fp8:

    case Intrinsic::amdgcn_dot4_f32_bf8_bf8:

    case Intrinsic::amdgcn_cvt_f32_fp8:

    case Intrinsic::amdgcn_cvt_f32_fp8_e5m3:

    case Intrinsic::amdgcn_cvt_f32_bf8:

    case Intrinsic::amdgcn_cvt_off_f32_i4:

    case Intrinsic::amdgcn_cvt_pk_f32_fp8:

    case Intrinsic::amdgcn_cvt_pk_f32_bf8:

    case Intrinsic::amdgcn_cvt_pk_fp8_f32:

    case Intrinsic::amdgcn_cvt_pk_fp8_f32_e5m3:

    case Intrinsic::amdgcn_cvt_pk_bf8_f32:

    case Intrinsic::amdgcn_cvt_sr_fp8_f32:

    case Intrinsic::amdgcn_cvt_sr_fp8_f32_e5m3:

    case Intrinsic::amdgcn_cvt_sr_bf8_f32:

    case Intrinsic::amdgcn_cvt_sr_bf16_f32:

    case Intrinsic::amdgcn_cvt_sr_f16_f32:

    case Intrinsic::amdgcn_cvt_f16_fp8:

    case Intrinsic::amdgcn_cvt_f16_bf8:

    case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16:

    case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:

    case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_f16_fp8:

    case Intrinsic::amdgcn_cvt_scalef32_f16_bf8:

    case Intrinsic::amdgcn_cvt_scalef32_f32_fp8:

    case Intrinsic::amdgcn_cvt_scalef32_f32_bf8:

    case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32:

    case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32:

    case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp8:

    case Intrinsic::amdgcn_cvt_scalef32_pk_f32_bf8:

    case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f16:

    case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f16:

    case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp4:

    case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f32:

    case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp4:

    case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp4:

    case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_fp6:

    case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_bf6:

    case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_bf6:

    case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_bf6:

    case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_fp6:

    case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_fp6:

    case Intrinsic::amdgcn_cvt_scalef32_pk_f16_bf8:

    case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_bf8:

    case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp8:

    case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp8:

    case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f16:

    case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32:

    case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f32:

    case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_bf16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f16:

    case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f32:

    case Intrinsic::amdgcn_ashr_pk_i8_i32:

    case Intrinsic::amdgcn_ashr_pk_u8_i32:

    case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:

    case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:

    case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:

    case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:

    case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:

    case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:

    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:

    case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:

    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:

    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:

    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:

    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:

    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:

    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:

    case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:

    case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:

    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:

    case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:

    case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:

    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:

    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:

    case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:

    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:

    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:

    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:

    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:

    case Intrinsic::amdgcn_wmma_f32_16x16x4_f32:

    case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16:

    case Intrinsic::amdgcn_wmma_f32_16x16x32_f16:

    case Intrinsic::amdgcn_wmma_f16_16x16x32_f16:

    case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16:

    case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16:

    case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8:

    case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8:

    case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8:

    case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8:

    case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8:

    case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8:

    case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8:

    case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8:

    case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8:

    case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8:

    case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8:

    case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8:

    case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8:

    case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8:

    case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8:

    case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:

    case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:

    case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:

    case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:

    case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:

    case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:

    case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4:

    case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4:

    case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:

    case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:

    case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:

    case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:

    case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:

    case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:

    case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:

    case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:

    case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:

    case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:

    case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:

    case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:

    case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8:

    case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:

    case Intrinsic::amdgcn_perm_pk16_b4_u4:

    case Intrinsic::amdgcn_perm_pk16_b6_u4:

    case Intrinsic::amdgcn_perm_pk16_b8_u4:

      return getDefaultMappingVOP(MI);

    case Intrinsic::amdgcn_log:

    case Intrinsic::amdgcn_exp2:

    case Intrinsic::amdgcn_rcp:

    case Intrinsic::amdgcn_rsq:

    case Intrinsic::amdgcn_sqrt: {

      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&

          isSALUMapping(MI))

        return getDefaultMappingSOP(MI);

      return getDefaultMappingVOP(MI);

    }

    case Intrinsic::amdgcn_sbfe:

    case Intrinsic::amdgcn_ubfe:

      if (isSALUMapping(MI))

        return getDefaultMappingSOP(MI);

      return getDefaultMappingVOP(MI);

    case Intrinsic::amdgcn_ds_swizzle:

    case Intrinsic::amdgcn_ds_permute:

    case Intrinsic::amdgcn_ds_bpermute:

    case Intrinsic::amdgcn_update_dpp:

    case Intrinsic::amdgcn_mov_dpp8:

    case Intrinsic::amdgcn_mov_dpp:

    case Intrinsic::amdgcn_strict_wwm:

    case Intrinsic::amdgcn_wwm:

    case Intrinsic::amdgcn_strict_wqm:

    case Intrinsic::amdgcn_wqm:

    case Intrinsic::amdgcn_softwqm:

    case Intrinsic::amdgcn_set_inactive:

    case Intrinsic::amdgcn_set_inactive_chain_arg:

    case Intrinsic::amdgcn_permlane64:

    case Intrinsic::amdgcn_ds_bpermute_fi_b32:

      return getDefaultMappingAllVGPR(MI);

    case Intrinsic::amdgcn_cvt_pkrtz:

      if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))

        return getDefaultMappingSOP(MI);

      return getDefaultMappingVOP(MI);

    case Intrinsic::amdgcn_kernarg_segment_ptr:

    case Intrinsic::amdgcn_s_getpc:

    case Intrinsic::amdgcn_groupstaticsize:

    case Intrinsic::amdgcn_reloc_constant:

    case Intrinsic::returnaddress: {

      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);

      break;

    }

    case Intrinsic::amdgcn_wqm_vote: {

      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = OpdsMapping[2]

        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);

      break;

    }

    case Intrinsic::amdgcn_ps_live: {

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);

      break;

    }

    case Intrinsic::amdgcn_div_scale: {

      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);

      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);


      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);

      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);

      break;

    }

    case Intrinsic::amdgcn_class: {

      Register Src0Reg = MI.getOperand(2).getReg();

      Register Src1Reg = MI.getOperand(3).getReg();

      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();

      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);

      break;

    }

    case Intrinsic::amdgcn_icmp:

    case Intrinsic::amdgcn_fcmp: {

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      // This is not VCCRegBank because this is not used in boolean contexts.

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);

      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);

      break;

    }

    case Intrinsic::amdgcn_readlane: {

      // This must be an SGPR, but accept a VGPR.

      Register IdxReg = MI.getOperand(3).getReg();

      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();

      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);

      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);

      [[fallthrough]];

    }

    case Intrinsic::amdgcn_readfirstlane: {

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);

      break;

    }

    case Intrinsic::amdgcn_writelane: {

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      Register SrcReg = MI.getOperand(2).getReg();

      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();

      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);

      Register IdxReg = MI.getOperand(3).getReg();

      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();

      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);


      // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted

      // to legalize.

      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);

      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);

      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);

      break;

    }

    case Intrinsic::amdgcn_if_break: {

      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);

      break;

    }

    case Intrinsic::amdgcn_permlane16:

    case Intrinsic::amdgcn_permlanex16: {

      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_permlane_bcast:

    case Intrinsic::amdgcn_permlane_up:

    case Intrinsic::amdgcn_permlane_down:

    case Intrinsic::amdgcn_permlane_xor: {

      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[3] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_permlane_idx_gen: {

      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[3] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_permlane16_var:

    case Intrinsic::amdgcn_permlanex16_var: {

      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      break;

    }

    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:

    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:

    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:

    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:

    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:

    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:

    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:

    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:

    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:

    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:

    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:

    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:

    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:

    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:

    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:

    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:

    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:

    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:

    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:

    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:

    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:

    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:

    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:

    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:

    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:

    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:

    case Intrinsic::amdgcn_mfma_f64_4x4x4f64:

    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:

    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:

    case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:

    case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:

    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:

    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:

    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:

    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:

    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:

    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:

    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:

    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:

    case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:

    case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:

    case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:

    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:

    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {

      // Default for MAI intrinsics.

      // srcC can also be an immediate which can be folded later.

      // FIXME: Should we eventually add an alternative mapping with AGPR src

      // for srcA/srcB?

      //

      // vdst, srcA, srcB, srcC

      const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

      OpdsMapping[0] =

          Info->mayNeedAGPRs()

              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)

              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

      OpdsMapping[4] =

          Info->mayNeedAGPRs()

              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)

              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:

    case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {

      const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

      OpdsMapping[0] =

          Info->mayNeedAGPRs()

              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)

              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);


      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

      OpdsMapping[4] =

          Info->mayNeedAGPRs()

              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)

              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);


      OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);

      OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:

    case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:

    case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:

    case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:

    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:

    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:

    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:

    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:

    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:

    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:

    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:

    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:

    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:

    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:

    case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:

    case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:

    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:

    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:

    case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:

    case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:

    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:

    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:

    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:

    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:

    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:

    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:

    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:

    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {

      // vdst, srcA, srcB, srcC, idx

      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_interp_p1:

    case Intrinsic::amdgcn_interp_p2:

    case Intrinsic::amdgcn_interp_mov:

    case Intrinsic::amdgcn_interp_p1_f16:

    case Intrinsic::amdgcn_interp_p2_f16:

    case Intrinsic::amdgcn_lds_param_load: {

      const int M0Idx = MI.getNumOperands() - 1;

      Register M0Reg = MI.getOperand(M0Idx).getReg();

      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();


      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)

        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);


      // Must be SGPR, but we must take whatever the original bank is and fix it

      // later.

      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);

      break;

    }

    case Intrinsic::amdgcn_interp_inreg_p10:

    case Intrinsic::amdgcn_interp_inreg_p2:

    case Intrinsic::amdgcn_interp_inreg_p10_f16:

    case Intrinsic::amdgcn_interp_inreg_p2_f16:

    case Intrinsic::amdgcn_interp_p10_rtz_f16:

    case Intrinsic::amdgcn_interp_p2_rtz_f16: {

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      break;

    }

    case Intrinsic::amdgcn_permlane16_swap:

    case Intrinsic::amdgcn_permlane32_swap: {

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = OpdsMapping[1] = OpdsMapping[3] = OpdsMapping[4] =

          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      break;

    }

    case Intrinsic::amdgcn_ballot: {

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);

      break;

    }

    case Intrinsic::amdgcn_inverse_ballot: {

      // This must be an SGPR, but accept a VGPR.

      Register MaskReg = MI.getOperand(2).getReg();

      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();

      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);

      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);

      break;

    }

    case Intrinsic::amdgcn_bitop3: {

      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      break;

    }

    case Intrinsic::amdgcn_s_quadmask:

    case Intrinsic::amdgcn_s_wqm: {

      Register MaskReg = MI.getOperand(2).getReg();

      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();

      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);

      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);

      break;

    }

    case Intrinsic::amdgcn_wave_reduce_add:

    case Intrinsic::amdgcn_wave_reduce_sub:

    case Intrinsic::amdgcn_wave_reduce_min:

    case Intrinsic::amdgcn_wave_reduce_umin:

    case Intrinsic::amdgcn_wave_reduce_max:

    case Intrinsic::amdgcn_wave_reduce_umax:

    case Intrinsic::amdgcn_wave_reduce_and:

    case Intrinsic::amdgcn_wave_reduce_or:

    case Intrinsic::amdgcn_wave_reduce_xor: {

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);

      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

      auto regBankID =

          isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

      OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);

      break;

    }

    case Intrinsic::amdgcn_s_bitreplicate:

      Register MaskReg = MI.getOperand(2).getReg();

      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);

      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);

    }

    break;

  }

  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:

  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:

  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:

  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:

  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {

    auto IntrID = AMDGPU::getIntrinsicID(MI);

    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);

    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");

    // Non-images can have complications from operands that allow both SGPR

    // and VGPR. For now it's too complicated to figure out the final opcode

    // to derive the register bank from the MCInstrDesc.

    assert(RSrcIntrin->IsImage);

    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);

  }

  case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:

  case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:

  case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {

    bool IsDualOrBVH8 =

        MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||

        MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;

    unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier

    unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;

    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

    if (IsDualOrBVH8) {

      OpdsMapping[1] = AMDGPU::getValueMapping(

          AMDGPU::VGPRRegBankID,

          MRI.getType(MI.getOperand(1).getReg()).getSizeInBits());

      OpdsMapping[2] = AMDGPU::getValueMapping(

          AMDGPU::VGPRRegBankID,

          MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());

    }

    OpdsMapping[LastRegOpIdx] =

        getSGPROpMapping(MI.getOperand(LastRegOpIdx).getReg(), MRI, *TRI);

    if (LastRegOpIdx == 3) {

      // Sequential form: all operands combined into VGPR256/VGPR512

      unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

      if (Size > 256)

        Size = 512;

      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

    } else {

      // NSA form

      unsigned FirstSrcOpIdx = IsDualOrBVH8 ? 4 : 2;

      for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) {

        unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();

        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      }

    }

    break;

  }

  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:

  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {

    auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();

    switch (IntrID) {

    case Intrinsic::amdgcn_s_getreg:

    case Intrinsic::amdgcn_s_memtime:

    case Intrinsic::amdgcn_s_memrealtime:

    case Intrinsic::amdgcn_s_get_waveid_in_workgroup:

    case Intrinsic::amdgcn_s_sendmsg_rtn: {

      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);

      break;

    }

    case Intrinsic::amdgcn_global_atomic_csub:

    case Intrinsic::amdgcn_global_atomic_fmin_num:

    case Intrinsic::amdgcn_global_atomic_fmax_num:

    case Intrinsic::amdgcn_flat_atomic_fmin_num:

    case Intrinsic::amdgcn_flat_atomic_fmax_num:

    case Intrinsic::amdgcn_atomic_cond_sub_u32:

    case Intrinsic::amdgcn_global_atomic_ordered_add_b64:

    case Intrinsic::amdgcn_global_load_tr_b64:

    case Intrinsic::amdgcn_global_load_tr_b128:

    case Intrinsic::amdgcn_global_load_tr4_b64:

    case Intrinsic::amdgcn_global_load_tr6_b96:

    case Intrinsic::amdgcn_ds_load_tr8_b64:

    case Intrinsic::amdgcn_ds_load_tr16_b128:

    case Intrinsic::amdgcn_ds_load_tr4_b64:

    case Intrinsic::amdgcn_ds_load_tr6_b96:

    case Intrinsic::amdgcn_flat_load_monitor_b32:

    case Intrinsic::amdgcn_flat_load_monitor_b64:

    case Intrinsic::amdgcn_flat_load_monitor_b128:

    case Intrinsic::amdgcn_global_load_monitor_b32:

    case Intrinsic::amdgcn_global_load_monitor_b64:

    case Intrinsic::amdgcn_global_load_monitor_b128:

    case Intrinsic::amdgcn_ds_read_tr4_b64:

    case Intrinsic::amdgcn_ds_read_tr6_b96:

    case Intrinsic::amdgcn_ds_read_tr8_b64:

    case Intrinsic::amdgcn_ds_read_tr16_b64:

    case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:

    case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:

      return getDefaultMappingAllVGPR(MI);

    case Intrinsic::amdgcn_ds_ordered_add:

    case Intrinsic::amdgcn_ds_ordered_swap: {

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,

                                 AMDGPU::SGPRRegBankID);

      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      break;

    }

    case Intrinsic::amdgcn_ds_append:

    case Intrinsic::amdgcn_ds_consume: {

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_exp_compr:

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      break;

    case Intrinsic::amdgcn_exp:

      // FIXME: Could we support packed types here?

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      break;

    case Intrinsic::amdgcn_exp_row:

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);

      break;

    case Intrinsic::amdgcn_s_sendmsg:

    case Intrinsic::amdgcn_s_sendmsghalt: {

      // This must be an SGPR, but accept a VGPR.

      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,

                                   AMDGPU::SGPRRegBankID);

      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);

      break;

    }

    case Intrinsic::amdgcn_s_setreg: {

      // This must be an SGPR, but accept a VGPR.

      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,

                                   AMDGPU::SGPRRegBankID);

      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);

      break;

    }

    case Intrinsic::amdgcn_s_ttracedata: {

      // This must be an SGPR, but accept a VGPR.

      unsigned Bank =

          getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);

      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);

      break;

    }

    case Intrinsic::amdgcn_end_cf: {

      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);

      break;

    }

    case Intrinsic::amdgcn_else: {

      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);

      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);

      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);

      break;

    }

    case Intrinsic::amdgcn_init_whole_wave:

    case Intrinsic::amdgcn_live_mask: {

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);

      break;

    }

    case Intrinsic::amdgcn_wqm_demote:

    case Intrinsic::amdgcn_kill: {

      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);

      break;

    }

    case Intrinsic::amdgcn_raw_buffer_load:

    case Intrinsic::amdgcn_raw_ptr_buffer_load:

    case Intrinsic::amdgcn_raw_atomic_buffer_load:

    case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:

    case Intrinsic::amdgcn_raw_tbuffer_load:

    case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {

      // FIXME: Should make intrinsic ID the last operand of the instruction,

      // then this would be the same as store

      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_raw_buffer_load_lds:

    case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {

      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_raw_buffer_store:

    case Intrinsic::amdgcn_raw_ptr_buffer_store:

    case Intrinsic::amdgcn_raw_buffer_store_format:

    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:

    case Intrinsic::amdgcn_raw_tbuffer_store:

    case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {

      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_struct_buffer_load:

    case Intrinsic::amdgcn_struct_ptr_buffer_load:

    case Intrinsic::amdgcn_struct_tbuffer_load:

    case Intrinsic::amdgcn_struct_ptr_tbuffer_load:

    case Intrinsic::amdgcn_struct_atomic_buffer_load:

    case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {

      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_struct_buffer_load_lds:

    case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {

      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_struct_buffer_store:

    case Intrinsic::amdgcn_struct_ptr_buffer_store:

    case Intrinsic::amdgcn_struct_tbuffer_store:

    case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {

      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_init_exec_from_input: {

      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);

      break;

    }

    case Intrinsic::amdgcn_ds_gws_init:

    case Intrinsic::amdgcn_ds_gws_barrier:

    case Intrinsic::amdgcn_ds_gws_sema_br: {

      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);


      // This must be an SGPR, but accept a VGPR.

      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,

                                   AMDGPU::SGPRRegBankID);

      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);

      break;

    }

    case Intrinsic::amdgcn_ds_gws_sema_v:

    case Intrinsic::amdgcn_ds_gws_sema_p:

    case Intrinsic::amdgcn_ds_gws_sema_release_all: {

      // This must be an SGPR, but accept a VGPR.

      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,

                                   AMDGPU::SGPRRegBankID);

      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);

      break;

    }

    case Intrinsic::amdgcn_global_store_async_from_lds_b8:

    case Intrinsic::amdgcn_global_store_async_from_lds_b32:

    case Intrinsic::amdgcn_global_store_async_from_lds_b64:

    case Intrinsic::amdgcn_global_store_async_from_lds_b128:

    case Intrinsic::amdgcn_global_load_async_to_lds_b8:

    case Intrinsic::amdgcn_global_load_async_to_lds_b32:

    case Intrinsic::amdgcn_global_load_async_to_lds_b64:

    case Intrinsic::amdgcn_global_load_async_to_lds_b128:

    case Intrinsic::amdgcn_load_to_lds:

    case Intrinsic::amdgcn_global_load_lds: {

      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_lds_direct_load: {

      const int M0Idx = MI.getNumOperands() - 1;

      Register M0Reg = MI.getOperand(M0Idx).getReg();

      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);

      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();


      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)

        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);


      // Must be SGPR, but we must take whatever the original bank is and fix it

      // later.

      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);

      break;

    }

    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:

    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:

      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      break;

    case Intrinsic::amdgcn_ds_bvh_stack_rtn:

    case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:

    case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:

    case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {

      OpdsMapping[0] =

          getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst

      OpdsMapping[1] =

          getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr

      OpdsMapping[3] =

          getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr

      OpdsMapping[4] =

          getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0

      OpdsMapping[5] =

          getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1

      break;

    }

    case Intrinsic::amdgcn_s_sleep_var:

      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

      break;

    case Intrinsic::amdgcn_s_barrier_join:

      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

      break;

    case Intrinsic::amdgcn_s_barrier_init:

    case Intrinsic::amdgcn_s_barrier_signal_var:

      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      break;

    case Intrinsic::amdgcn_s_barrier_signal_isfirst: {

      const unsigned ResultSize = 1;

      OpdsMapping[0] =

          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);

      break;

    }

    case Intrinsic::amdgcn_s_get_barrier_state:

    case Intrinsic::amdgcn_s_get_named_barrier_state: {

      OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_pops_exiting_wave_id:

      return getDefaultMappingSOP(MI);

    case Intrinsic::amdgcn_tensor_load_to_lds_d2:

    case Intrinsic::amdgcn_tensor_store_from_lds_d2:

    case Intrinsic::amdgcn_tensor_load_to_lds:

    case Intrinsic::amdgcn_tensor_store_from_lds: {

      // Lie and claim everything is legal, even all operands need to be

      // SGPRs. applyMapping will have to deal with it with readfirstlane.

      for (unsigned I = 1; I < MI.getNumOperands(); ++I) {

        if (MI.getOperand(I).isReg()) {

          Register Reg = MI.getOperand(I).getReg();

          auto OpBank = getRegBankID(Reg, MRI);

          unsigned Size = getSizeInBits(Reg, MRI, *TRI);

          OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);

        }

      }

      break;

    }

    case Intrinsic::amdgcn_s_prefetch_data: {

      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

      break;

    }

    case Intrinsic::amdgcn_flat_prefetch:

    case Intrinsic::amdgcn_global_prefetch:

      return getDefaultMappingVOP(MI);

    default:

      return getInvalidInstructionMapping();

    }

    break;

  }

  case AMDGPU::G_SELECT: {

    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,

                                    AMDGPU::SGPRRegBankID);

    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,

                                    AMDGPU::SGPRRegBankID);

    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&

                    Op3Bank == AMDGPU::SGPRRegBankID;


    unsigned CondBankDefault = SGPRSrcs ?

      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;

    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,

                                     CondBankDefault);

    if (CondBank == AMDGPU::SGPRRegBankID)

      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;

    else if (CondBank == AMDGPU::VGPRRegBankID)

      CondBank = AMDGPU::VCCRegBankID;


    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?

      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;


    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);


    // TODO: Should report 32-bit for scalar condition type.

    if (Size == 64) {

      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);

      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);

      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);

      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);

    } else {

      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);

      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);

      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);

      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);

    }


    break;

  }


  case AMDGPU::G_SI_CALL: {

    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);

    // Lie and claim everything is legal, even though some need to be

    // SGPRs. applyMapping will have to deal with it as a waterfall loop.

    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);


    // Allow anything for implicit arguments

    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {

      if (MI.getOperand(I).isReg()) {

        Register Reg = MI.getOperand(I).getReg();

        auto OpBank = getRegBankID(Reg, MRI);

        unsigned Size = getSizeInBits(Reg, MRI, *TRI);

        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);

      }

    }

    break;

  }

  case AMDGPU::G_LOAD:

  case AMDGPU::G_ZEXTLOAD:

  case AMDGPU::G_SEXTLOAD:

    return getInstrMappingForLoad(MI);


  case AMDGPU::G_ATOMICRMW_XCHG:

  case AMDGPU::G_ATOMICRMW_ADD:

  case AMDGPU::G_ATOMICRMW_SUB:

  case AMDGPU::G_ATOMICRMW_AND:

  case AMDGPU::G_ATOMICRMW_OR:

  case AMDGPU::G_ATOMICRMW_XOR:

  case AMDGPU::G_ATOMICRMW_MAX:

  case AMDGPU::G_ATOMICRMW_MIN:

  case AMDGPU::G_ATOMICRMW_UMAX:

  case AMDGPU::G_ATOMICRMW_UMIN:

  case AMDGPU::G_ATOMICRMW_FADD:

  case AMDGPU::G_ATOMICRMW_FMIN:

  case AMDGPU::G_ATOMICRMW_FMAX:

  case AMDGPU::G_ATOMICRMW_UINC_WRAP:

  case AMDGPU::G_ATOMICRMW_UDEC_WRAP:

  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {

    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());

    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    break;

  }

  case AMDGPU::G_ATOMIC_CMPXCHG: {

    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());

    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    break;

  }

  case AMDGPU::G_BRCOND: {

    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,

                                 AMDGPU::SGPRRegBankID);

    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    if (Bank != AMDGPU::SGPRRegBankID)

      Bank = AMDGPU::VCCRegBankID;


    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);

    break;

  }

  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:

    return getDefaultMappingVOP(MI);

  case AMDGPU::G_PREFETCH:

    OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    break;

  case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP:

  case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:

    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);

    break;

  }


  return getInstructionMapping(/*ID*/1, /*Cost*/1,

                               getOperandsMapping(OpdsMapping),

                               MI.getNumOperands());

}

MRI
unsigned const MachineRegisterInfo * MRI
Definition: AArch64AdvSIMDScalarPass.cpp:103

getIntrinsicID
static unsigned getIntrinsicID(const SDNode *N)
Definition: AArch64ISelLowering.cpp:7799

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

AMDGPUGlobalISelUtils.h

AMDGPUInstrInfo.h
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.

S16
constexpr LLT S16
Definition: AMDGPULegalizerInfo.cpp:296

S1
constexpr LLT S1
Definition: AMDGPULegalizerInfo.cpp:294

S32
constexpr LLT S32
Definition: AMDGPULegalizerInfo.cpp:297

S64
constexpr LLT S64
Definition: AMDGPULegalizerInfo.cpp:299

Select
AMDGPU Register Bank Select
Definition: AMDGPURegBankSelect.cpp:68

substituteSimpleCopyRegs
static bool substituteSimpleCopyRegs(const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx)
Definition: AMDGPURegisterBankInfo.cpp:1771

regBankBoolUnion
static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1)
Definition: AMDGPURegisterBankInfo.cpp:3570

getBaseWithConstantOffset
static std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg)
Definition: AMDGPURegisterBankInfo.cpp:1810

constrainRegToBank
static Register constrainRegToBank(MachineRegisterInfo &MRI, MachineIRBuilder &B, Register &Reg, const RegisterBank &Bank)
Definition: AMDGPURegisterBankInfo.cpp:2033

unpackV2S16ToS32
static std::pair< Register, Register > unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode)
Definition: AMDGPURegisterBankInfo.cpp:1749

extendLow32IntoHigh32
static void extendLow32IntoHigh32(MachineIRBuilder &B, Register Hi32Reg, Register Lo32Reg, unsigned ExtOpc, const RegisterBank &RegBank, bool IsBooleanSrc=false)
Implement extending a 32-bit value to a 64-bit value.
Definition: AMDGPURegisterBankInfo.cpp:1925

getExtendOp
static unsigned getExtendOp(unsigned Opc)
Definition: AMDGPURegisterBankInfo.cpp:1731

isVectorRegisterBank
static bool isVectorRegisterBank(const RegisterBank &Bank)
Definition: AMDGPURegisterBankInfo.cpp:221

regBankUnion
static unsigned regBankUnion(unsigned RB0, unsigned RB1)
Definition: AMDGPURegisterBankInfo.cpp:3555

splitUnequalType
static std::pair< LLT, LLT > splitUnequalType(LLT Ty, unsigned FirstSize)
Split Ty into 2 pieces.
Definition: AMDGPURegisterBankInfo.cpp:1029

setRegsToType
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef< Register > Regs, LLT NewTy)
Replace the current type each register in Regs has with NewTy.
Definition: AMDGPURegisterBankInfo.cpp:682

reinsertVectorIndexAdd
static void reinsertVectorIndexAdd(MachineIRBuilder &B, MachineInstr &IdxUseInstr, unsigned OpIdx, unsigned ConstOffset)
Utility function for pushing dynamic vector indexes with a constant offset into waterfall loops.
Definition: AMDGPURegisterBankInfo.cpp:1904

widen96To128
static LLT widen96To128(LLT Ty)
Definition: AMDGPURegisterBankInfo.cpp:1045

getHalfSizedType
static LLT getHalfSizedType(LLT Ty)
Definition: AMDGPURegisterBankInfo.cpp:690

getSBufferLoadCorrespondingBufferLoadOpcode
static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc)
Definition: AMDGPURegisterBankInfo.cpp:1339

AMDGPURegisterBankInfo.h
This file declares the targeting of the RegisterBankInfo class for AMDGPU.

PHI
Rewrite undef for PHI
Definition: AMDGPURewriteUndefForPHI.cpp:98

AMDGPU.h

MBB
MachineBasicBlock & MBB
Definition: ARMSLSHardening.cpp:71

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: ARMSLSHardening.cpp:73

MBBI
MachineBasicBlock MachineBasicBlock::iterator MBBI
Definition: ARMSLSHardening.cpp:72

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27

Idx
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Definition: DeadArgumentElimination.cpp:347

Size
uint64_t Size
Definition: ELFObjHandler.cpp:81

End
bool End
Definition: ELF_riscv.cpp:480

X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")

GCNSubtarget.h
AMD GCN specific subclass of TargetSubtarget.

GenericMachineInstrs.h
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...

TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:118

MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:110

LegalizerHelper.h

I
#define I(x, y, z)
Definition: MD5.cpp:58

MIPatternMatch.h
Contains matchers for matching SSA Machine Instructions.

Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:74

MachineIRBuilder.h
This file declares the MachineIRBuilder class.

TRI
Register const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:2118

isReg
static bool isReg(const MCInst &MI, unsigned OpNo)
Definition: MipsInstPrinter.cpp:32

OpIdx
MachineInstr unsigned OpIdx
Definition: NVPTXPrologEpilogPass.cpp:56

Range
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))

Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")

SPReg
static constexpr MCPhysReg SPReg
Definition: RISCVFrameLowering.cpp:54

Opc
auto Opc
Definition: RISCVRedundantCopyElimination.cpp:75

RegisterBank.h

SIMachineFunctionInfo.h

SIRegisterInfo.h
Interface definition for SIRegisterInfo.

T

llvm::AMDGPURegisterBankInfo
Definition: AMDGPURegisterBankInfo.h:42

llvm::AMDGPURegisterBankInfo::applyMappingDynStackAlloc
bool applyMappingDynStackAlloc(MachineIRBuilder &B, const OperandsMapper &OpdMapper, MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:1172

llvm::AMDGPURegisterBankInfo::splitBufferOffsets
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register Offset) const
Definition: AMDGPURegisterBankInfo.cpp:1824

llvm::AMDGPURegisterBankInfo::collectWaterfallOperands
bool collectWaterfallOperands(SmallSet< Register, 4 > &SGPROperandRegs, MachineInstr &MI, MachineRegisterInfo &MRI, ArrayRef< unsigned > OpIndices) const
Definition: AMDGPURegisterBankInfo.cpp:985

llvm::AMDGPURegisterBankInfo::getImageMapping
const InstructionMapping & getImageMapping(const MachineRegisterInfo &MRI, const MachineInstr &MI, int RsrcIdx) const
Definition: AMDGPURegisterBankInfo.cpp:3681

llvm::AMDGPURegisterBankInfo::addMappingFromTable
InstructionMappings addMappingFromTable(const MachineInstr &MI, const MachineRegisterInfo &MRI, const std::array< unsigned, NumOps > RegSrcOpIdx, ArrayRef< OpRegBankEntry< NumOps > > Table) const

llvm::AMDGPURegisterBankInfo::copyCost
unsigned copyCost(const RegisterBank &A, const RegisterBank &B, TypeSize Size) const override
Get the cost of a copy from B to A, or put differently, get the cost of A = COPY B.
Definition: AMDGPURegisterBankInfo.cpp:230

llvm::AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects
RegisterBankInfo::InstructionMappings getInstrAlternativeMappingsIntrinsicWSideEffects(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
Definition: AMDGPURegisterBankInfo.cpp:386

llvm::AMDGPURegisterBankInfo::buildVCopy
bool buildVCopy(MachineIRBuilder &B, Register DstReg, Register SrcReg) const
Definition: AMDGPURegisterBankInfo.cpp:1869

llvm::AMDGPURegisterBankInfo::executeInWaterfallLoop
bool executeInWaterfallLoop(MachineIRBuilder &B, iterator_range< MachineBasicBlock::iterator > Range, SmallSet< Register, 4 > &SGPROperandRegs) const
Legalize instruction MI where operands in OpIndices must be SGPRs.
Definition: AMDGPURegisterBankInfo.cpp:775

llvm::AMDGPURegisterBankInfo::getRegBankFromRegClass
const RegisterBank & getRegBankFromRegClass(const TargetRegisterClass &RC, LLT) const override
Get a register bank that covers RC.
Definition: AMDGPURegisterBankInfo.cpp:287

llvm::AMDGPURegisterBankInfo::AMDGPURegisterBankInfo
AMDGPURegisterBankInfo(const GCNSubtarget &STI)
Definition: AMDGPURegisterBankInfo.cpp:204

llvm::AMDGPURegisterBankInfo::applyMappingMAD_64_32
bool applyMappingMAD_64_32(MachineIRBuilder &B, const OperandsMapper &OpdMapper) const
Definition: AMDGPURegisterBankInfo.cpp:1581

llvm::AMDGPURegisterBankInfo::getRegBankID
unsigned getRegBankID(Register Reg, const MachineRegisterInfo &MRI, unsigned Default=AMDGPU::VGPRRegBankID) const
Definition: AMDGPURegisterBankInfo.cpp:3787

llvm::AMDGPURegisterBankInfo::handleD16VData
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg) const
Handle register layout difference for f16 images for some subtargets.
Definition: AMDGPURegisterBankInfo.cpp:1784

llvm::AMDGPURegisterBankInfo::getInstrMappingForLoad
const RegisterBankInfo::InstructionMapping & getInstrMappingForLoad(const MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:3740

llvm::AMDGPURegisterBankInfo::applyMappingImpl
void applyMappingImpl(MachineIRBuilder &Builder, const OperandsMapper &OpdMapper) const override
See RegisterBankInfo::applyMapping.
Definition: AMDGPURegisterBankInfo.cpp:2204

llvm::AMDGPURegisterBankInfo::applyMappingBFE
bool applyMappingBFE(MachineIRBuilder &B, const OperandsMapper &OpdMapper, bool Signed) const
Definition: AMDGPURegisterBankInfo.cpp:1475

llvm::AMDGPURegisterBankInfo::applyMappingImage
bool applyMappingImage(MachineIRBuilder &B, MachineInstr &MI, const OperandsMapper &OpdMapper, int RSrcIdx) const
Definition: AMDGPURegisterBankInfo.cpp:1226

llvm::AMDGPURegisterBankInfo::getVGPROpMapping
const ValueMapping * getVGPROpMapping(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Definition: AMDGPURegisterBankInfo.cpp:3806

llvm::AMDGPURegisterBankInfo::TII
const SIInstrInfo * TII
Definition: AMDGPURegisterBankInfo.h:46

llvm::AMDGPURegisterBankInfo::isScalarLoadLegal
bool isScalarLoadLegal(const MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:444

llvm::AMDGPURegisterBankInfo::setBufferOffsets
unsigned setBufferOffsets(MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg, Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const
Definition: AMDGPURegisterBankInfo.cpp:1256

llvm::AMDGPURegisterBankInfo::getSGPROpMapping
const ValueMapping * getSGPROpMapping(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Definition: AMDGPURegisterBankInfo.cpp:3795

llvm::AMDGPURegisterBankInfo::applyMappingLoad
bool applyMappingLoad(MachineIRBuilder &B, const OperandsMapper &OpdMapper, MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:1054

llvm::AMDGPURegisterBankInfo::split64BitValueForMapping
void split64BitValueForMapping(MachineIRBuilder &B, SmallVector< Register, 2 > &Regs, LLT HalfTy, Register Reg) const
Split 64-bit value Reg into two 32-bit halves and populate them into Regs.
Definition: AMDGPURegisterBankInfo.cpp:659

llvm::AMDGPURegisterBankInfo::getValueMappingForPtr
const ValueMapping * getValueMappingForPtr(const MachineRegisterInfo &MRI, Register Ptr) const
Return the mapping for a pointer argument.
Definition: AMDGPURegisterBankInfo.cpp:3725

llvm::AMDGPURegisterBankInfo::getMappingType
unsigned getMappingType(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:3586

llvm::AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic
RegisterBankInfo::InstructionMappings getInstrAlternativeMappingsIntrinsic(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
Definition: AMDGPURegisterBankInfo.cpp:346

llvm::AMDGPURegisterBankInfo::isDivergentRegBank
bool isDivergentRegBank(const RegisterBank *RB) const override
Returns true if the register bank is considered divergent.
Definition: AMDGPURegisterBankInfo.cpp:226

llvm::AMDGPURegisterBankInfo::constrainOpWithReadfirstlane
void constrainOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const
Definition: AMDGPURegisterBankInfo.cpp:1015

llvm::AMDGPURegisterBankInfo::getInstrAlternativeMappings
InstructionMappings getInstrAlternativeMappings(const MachineInstr &MI) const override
Get the alternative mappings for MI.
Definition: AMDGPURegisterBankInfo.cpp:470

llvm::AMDGPURegisterBankInfo::getDefaultMappingSOP
const InstructionMapping & getDefaultMappingSOP(const MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:3620

llvm::AMDGPURegisterBankInfo::getDefaultMappingAllVGPR
const InstructionMapping & getDefaultMappingAllVGPR(const MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:3662

llvm::AMDGPURegisterBankInfo::getInstrMapping
const InstructionMapping & getInstrMapping(const MachineInstr &MI) const override
This function must return a legal mapping, because AMDGPURegisterBankInfo::getInstrAlternativeMapping...
Definition: AMDGPURegisterBankInfo.cpp:3831

llvm::AMDGPURegisterBankInfo::getBreakDownCost
unsigned getBreakDownCost(const ValueMapping &ValMapping, const RegisterBank *CurBank=nullptr) const override
Get the cost of using ValMapping to decompose a register.
Definition: AMDGPURegisterBankInfo.cpp:261

llvm::AMDGPURegisterBankInfo::getAGPROpMapping
const ValueMapping * getAGPROpMapping(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Definition: AMDGPURegisterBankInfo.cpp:3814

llvm::AMDGPURegisterBankInfo::Subtarget
const GCNSubtarget & Subtarget
Definition: AMDGPURegisterBankInfo.h:44

llvm::AMDGPURegisterBankInfo::getDefaultMappingVOP
const InstructionMapping & getDefaultMappingVOP(const MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:3638

llvm::AMDGPURegisterBankInfo::isSALUMapping
bool isSALUMapping(const MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:3604

llvm::AMDGPURegisterBankInfo::buildReadFirstLane
Register buildReadFirstLane(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Src) const
Definition: AMDGPURegisterBankInfo.cpp:703

llvm::AMDGPURegisterBankInfo::applyMappingSBufferLoad
bool applyMappingSBufferLoad(MachineIRBuilder &B, const OperandsMapper &OpdMapper) const
Definition: AMDGPURegisterBankInfo.cpp:1357

llvm::AMDGPURegisterBankInfo::applyMappingSMULU64
void applyMappingSMULU64(MachineIRBuilder &B, const OperandsMapper &OpdMapper) const
Definition: AMDGPURegisterBankInfo.cpp:2137

llvm::AMDGPURegisterBankInfo::TRI
const SIRegisterInfo * TRI
Definition: AMDGPURegisterBankInfo.h:45

llvm::AMDGPUSubtarget::hasSMulHi
bool hasSMulHi() const
Definition: AMDGPUSubtarget.h:283

llvm::AMDGPUSubtarget::GFX12
@ GFX12
Definition: AMDGPUSubtarget.h:44

llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41

llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678

llvm::CmpInst::ICMP_SLT
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:707

llvm::CmpInst::ICMP_EQ
@ ICMP_EQ
equal
Definition: InstrTypes.h:699

llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:700

llvm::DWARFExpression::Operation
This class represents an Operation in the Expression.
Definition: DWARFExpression.h:33

llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:124

llvm::DenseMapBase::find
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:177

llvm::DenseMapBase::end
iterator end()
Definition: DenseMap.h:87

llvm::DenseMapBase::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:230

llvm::DenseMap
Definition: DenseMap.h:730

llvm::ElementCount::getFixed
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:312

llvm::GCNSubtarget
Definition: GCNSubtarget.h:34

llvm::GCNSubtarget::hasScalarCompareEq64
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:1081

llvm::GCNSubtarget::hasSafeSmemPrefetch
bool hasSafeSmemPrefetch() const
Definition: GCNSubtarget.h:1015

llvm::GCNSubtarget::hasScalarSubwordLoads
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:500

llvm::GCNSubtarget::hasFullRate64Ops
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:420

llvm::GCNSubtarget::hasIntMinMax64
bool hasIntMinMax64() const
Definition: GCNSubtarget.h:1563

llvm::GCNSubtarget::hasVmemPrefInsts
bool hasVmemPrefInsts() const
Definition: GCNSubtarget.h:1013

llvm::GCNSubtarget::isWave32
bool isWave32() const
Definition: GCNSubtarget.h:1744

llvm::GCNSubtarget::hasScalarDwordx3Loads
bool hasScalarDwordx3Loads() const
Definition: GCNSubtarget.h:1085

llvm::GCNSubtarget::hasVectorMulU64
bool hasVectorMulU64() const
Definition: GCNSubtarget.h:1556

llvm::GCNSubtarget::hasScalarMulHiInsts
bool hasScalarMulHiInsts() const
Definition: GCNSubtarget.h:496

llvm::GCNSubtarget::hasPseudoScalarTrans
bool hasPseudoScalarTrans() const
Definition: GCNSubtarget.h:1397

llvm::GCNSubtarget::useFlatForGlobal
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:576

llvm::GCNSubtarget::getGeneration
Generation getGeneration() const
Definition: GCNSubtarget.h:356

llvm::GCNSubtarget::hasUnpackedD16VMem
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:787

llvm::GCNSubtarget::hasSALUFloatInsts
bool hasSALUFloatInsts() const
Definition: GCNSubtarget.h:1395

llvm::GISelChangeObserver
Abstract class that contains various methods for clients to notify about changes.
Definition: GISelChangeObserver.h:30

llvm::GISelChangeObserver::changingInstr
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.

llvm::GISelChangeObserver::changedInstr
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.

llvm::GISelChangeObserver::createdInstr
virtual void createdInstr(MachineInstr &MI)=0
An instruction has been created and inserted into the function.

llvm::GISelChangeObserver::erasingInstr
virtual void erasingInstr(MachineInstr &MI)=0
An instruction is about to be erased.

llvm::LLT
Definition: LowLevelType.h:40

llvm::LLT::getScalarSizeInBits
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:265

llvm::LLT::isScalar
constexpr bool isScalar() const
Definition: LowLevelType.h:147

llvm::LLT::scalar
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:43

llvm::LLT::isValid
constexpr bool isValid() const
Definition: LowLevelType.h:146

llvm::LLT::getNumElements
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
Definition: LowLevelType.h:160

llvm::LLT::isVector
constexpr bool isVector() const
Definition: LowLevelType.h:149

llvm::LLT::getSizeInBits
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:191

llvm::LLT::getElementType
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:278

llvm::LLT::getElementCount
constexpr ElementCount getElementCount() const
Definition: LowLevelType.h:184

llvm::LLT::getAddressSpace
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:271

llvm::LLT::fixed_vector
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:101

llvm::LLT::getScalarType
constexpr LLT getScalarType() const
Definition: LowLevelType.h:206

llvm::LLT::scalarOrVector
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
Definition: LowLevelType.h:125

llvm::LLT::divide
constexpr LLT divide(int Factor) const
Return a type that is Factor times smaller.
Definition: LowLevelType.h:235

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68

llvm::LegalizerHelper
Definition: LegalizerHelper.h:49

llvm::LegalizerHelper::widenScalarSrc
LLVM_ABI void widenScalarSrc(MachineInstr &MI, LLT WideTy, unsigned OpIdx, unsigned ExtOpcode)
Legalize a single operand OpIdx of the machine instruction MI as a Use by extending the operand's typ...
Definition: LegalizerHelper.cpp:2082

llvm::LegalizerHelper::lowerAbsToMaxNeg
LLVM_ABI LegalizeResult lowerAbsToMaxNeg(MachineInstr &MI)
Definition: LegalizerHelper.cpp:9519

llvm::LegalizerHelper::narrowScalar
LLVM_ABI LegalizeResult narrowScalar(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy)
Legalize an instruction by reducing the width of the underlying scalar type.
Definition: LegalizerHelper.cpp:1483

llvm::LegalizerHelper::reduceLoadStoreWidth
LLVM_ABI LegalizeResult reduceLoadStoreWidth(GLoadStore &MI, unsigned TypeIdx, LLT NarrowTy)
Definition: LegalizerHelper.cpp:5293

llvm::LegalizerHelper::Legalized
@ Legalized
Instruction has been legalized and the MachineFunction changed.
Definition: LegalizerHelper.h:71

llvm::LegalizerHelper::fewerElementsVector
LLVM_ABI LegalizeResult fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy)
Legalize a vector instruction by splitting into multiple components, each acting on the same scalar t...
Definition: LegalizerHelper.cpp:5394

llvm::LegalizerHelper::widenScalar
LLVM_ABI LegalizeResult widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy)
Legalize an instruction by performing the operation on a wider scalar type (for example a 16-bit addi...
Definition: LegalizerHelper.cpp:2673

llvm::LegalizerHelper::widenScalarDst
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
Definition: LegalizerHelper.cpp:2096

llvm::LocationSize::getValue
TypeSize getValue() const
Definition: MemoryLocation.h:157

llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:122

llvm::MachineBasicBlock::transferSuccessorsAndUpdatePHIs
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
Definition: MachineBasicBlock.cpp:935

llvm::MachineBasicBlock::getFirstTerminator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Definition: MachineBasicBlock.cpp:242

llvm::MachineBasicBlock::addSuccessor
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Definition: MachineBasicBlock.cpp:796

llvm::MachineBasicBlock::begin
iterator begin()
Definition: MachineBasicBlock.h:377

llvm::MachineBasicBlock::end
iterator end()
Definition: MachineBasicBlock.h:379

llvm::MachineBasicBlock::getParent
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Definition: MachineBasicBlock.h:323

llvm::MachineBasicBlock::splice
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Definition: MachineBasicBlock.h:1149

llvm::MachineFunction
Definition: MachineFunction.h:286

llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:762

llvm::MachineFunction::getMachineMemOperand
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
Definition: MachineFunction.cpp:536

llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:772

llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Definition: MachineFunction.h:860

llvm::MachineFunction::CreateMachineBasicBlock
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Definition: MachineFunction.cpp:499

llvm::MachineFunction::insert
void insert(iterator MBBI, MachineBasicBlock *MBB)
Definition: MachineFunction.h:1003

llvm::MachineIRBuilder
Helper class to build MachineInstr.
Definition: MachineIRBuilder.h:236

llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition: MachineInstrBuilder.h:126

llvm::MachineInstrBundleIterator< MachineInstr >

llvm::MachineInstrSpan
MachineInstrSpan provides an interface to get an iteration range containing the instruction it was in...
Definition: MachineBasicBlock.h:1436

llvm::MachineInstrSpan::begin
MachineBasicBlock::iterator begin()
Definition: MachineBasicBlock.h:1447

llvm::MachineInstrSpan::end
MachineBasicBlock::iterator end()
Definition: MachineBasicBlock.h:1450

llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:72

llvm::MachineInstr::getParent
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:359

llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595

llvm::MachineMemOperand
A description of a memory reference used in the backend.
Definition: MachineMemOperand.h:130

llvm::MachineMemOperand::getSize
LocationSize getSize() const
Return the size in bytes of the memory reference.
Definition: MachineMemOperand.h:243

llvm::MachineMemOperand::isVolatile
bool isVolatile() const
Definition: MachineMemOperand.h:301

llvm::MachineMemOperand::getAddrSpace
unsigned getAddrSpace() const
Definition: MachineMemOperand.h:236

llvm::MachineMemOperand::isAtomic
bool isAtomic() const
Returns true if this operation has an atomic ordering requirement of unordered or higher,...
Definition: MachineMemOperand.h:308

llvm::MachineMemOperand::isInvariant
bool isInvariant() const
Definition: MachineMemOperand.h:304

llvm::MachineMemOperand::MODereferenceable
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
Definition: MachineMemOperand.h:145

llvm::MachineMemOperand::MOLoad
@ MOLoad
The memory access reads data.
Definition: MachineMemOperand.h:137

llvm::MachineMemOperand::MOInvariant
@ MOInvariant
The memory access always returns the same value (or traps).
Definition: MachineMemOperand.h:147

llvm::MachineMemOperand::getFlags
Flags getFlags() const
Return the raw flags of the source value,.
Definition: MachineMemOperand.h:227

llvm::MachineMemOperand::getAlign
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
Definition: MachineOperand.cpp:1138

llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:48

llvm::MachineOperand::setReg
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Definition: MachineOperand.cpp:60

llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition: MachineOperand.h:368

llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:53

llvm::RegisterBankInfo::InstructionMapping
Helper class that represents how the value of an instruction may be mapped and what is the related co...
Definition: RegisterBankInfo.h:191

llvm::RegisterBankInfo::InstructionMapping::isValid
bool isValid() const
Check whether this object is valid.
Definition: RegisterBankInfo.h:255

llvm::RegisterBankInfo::OperandsMapper
Helper class used to get/create the virtual registers that will be used to replace the MachineOperand...
Definition: RegisterBankInfo.h:281

llvm::RegisterBankInfo::OperandsMapper::getInstrMapping
const InstructionMapping & getInstrMapping() const
The final mapping of the instruction.
Definition: RegisterBankInfo.h:333

llvm::RegisterBankInfo::OperandsMapper::getMI
MachineInstr & getMI() const
Definition: RegisterBankInfo.h:330

llvm::RegisterBankInfo::OperandsMapper::getMRI
MachineRegisterInfo & getMRI() const
The MachineRegisterInfo we used to realize the mapping.
Definition: RegisterBankInfo.h:336

llvm::RegisterBankInfo::OperandsMapper::getVRegs
iterator_range< SmallVectorImpl< Register >::const_iterator > getVRegs(unsigned OpIdx, bool ForDebug=false) const
Get all the virtual registers required to map the OpIdx-th operand of the instruction.
Definition: RegisterBankInfo.cpp:745

llvm::RegisterBankInfo::getInstrAlternativeMappings
virtual InstructionMappings getInstrAlternativeMappings(const MachineInstr &MI) const
Get the alternative mappings for MI.
Definition: RegisterBankInfo.cpp:433

llvm::RegisterBankInfo::constrainGenericRegister
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
Definition: RegisterBankInfo.cpp:131

llvm::RegisterBankInfo::getInstructionMapping
const InstructionMapping & getInstructionMapping(unsigned ID, unsigned Cost, const ValueMapping *OperandsMapping, unsigned NumOperands) const
Method to get a uniquely generated InstructionMapping.
Definition: RegisterBankInfo.h:534

llvm::RegisterBankInfo::applyDefaultMapping
static void applyDefaultMapping(const OperandsMapper &OpdMapper)
Helper method to apply something that is like the default mapping.
Definition: RegisterBankInfo.cpp:438

llvm::RegisterBankInfo::getValueMapping
const ValueMapping & getValueMapping(unsigned StartIdx, unsigned Length, const RegisterBank &RegBank) const
The most common ValueMapping consists of a single PartialMapping.
Definition: RegisterBankInfo.cpp:297

llvm::RegisterBankInfo::getInvalidInstructionMapping
const InstructionMapping & getInvalidInstructionMapping() const
Method to get a uniquely generated invalid InstructionMapping.
Definition: RegisterBankInfo.h:542

llvm::RegisterBankInfo::getRegBank
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
Definition: RegisterBankInfo.h:440

llvm::RegisterBankInfo::Sizes
const unsigned * Sizes
Hold the sizes of the register banks for all HwModes.
Definition: RegisterBankInfo.h:395

llvm::RegisterBankInfo::cannotCopy
bool cannotCopy(const RegisterBank &Dst, const RegisterBank &Src, TypeSize Size) const
Definition: RegisterBankInfo.h:643

llvm::RegisterBankInfo::getSizeInBits
TypeSize getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
Definition: RegisterBankInfo.cpp:497

llvm::RegisterBankInfo::getOperandsMapping
const ValueMapping * getOperandsMapping(Iterator Begin, Iterator End) const
Get the uniquely generated array of ValueMapping for the elements of between Begin and End.
Definition: RegisterBankInfo.cpp:332

llvm::RegisterBankInfo::copyCost
virtual unsigned copyCost(const RegisterBank &A, const RegisterBank &B, TypeSize Size) const
Get the cost of a copy from B to A, or put differently, get the cost of A = COPY B.
Definition: RegisterBankInfo.h:633

llvm::RegisterBankInfo::getInstrMappingImpl
const InstructionMapping & getInstrMappingImpl(const MachineInstr &MI) const
Try to get the mapping of MI.
Definition: RegisterBankInfo.cpp:161

llvm::RegisterBank
This class implements the register bank concept.
Definition: RegisterBank.h:29

llvm::RegisterBank::getID
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:46

llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19

llvm::Register::isVirtual
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:74

llvm::SIInstrInfo::splitMUBUFOffset
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
Definition: SIInstrInfo.cpp:9701

llvm::SIInstrInfo::getMaxMUBUFImmOffset
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:9661

llvm::SIMachineFunctionInfo
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Definition: SIMachineFunctionInfo.h:412

llvm::SIRegisterInfo::getWaveMaskRegClass
const TargetRegisterClass * getWaveMaskRegClass() const
Definition: SIRegisterInfo.h:380

llvm::SIRegisterInfo::isSGPRClass
static bool isSGPRClass(const TargetRegisterClass *RC)
Definition: SIRegisterInfo.h:221

llvm::SIRegisterInfo::isAGPRClass
static bool isAGPRClass(const TargetRegisterClass *RC)
Definition: SIRegisterInfo.h:245

llvm::SITargetLowering::shouldExpandVectorDynExt
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
Definition: SIISelLowering.cpp:14420

llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134

llvm::SmallSet::count
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:176

llvm::SmallSet::empty
bool empty() const
Definition: SmallSet.h:169

llvm::SmallSet::insert
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182

llvm::SmallVectorBase::empty
bool empty() const
Definition: SmallVector.h:82

llvm::SmallVectorBase::size
size_t size() const
Definition: SmallVector.h:79

llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:611

llvm::SmallVectorImpl::resize
void resize(size_type N)
Definition: SmallVector.h:639

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition: SmallVector.h:414

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197

llvm::SrcOp
Definition: MachineIRBuilder.h:143

llvm::SrcOp::getReg
Register getReg() const
Definition: MachineIRBuilder.h:195

llvm::TargetFrameLowering::StackGrowsUp
@ StackGrowsUp
Definition: TargetFrameLowering.h:49

llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:45

llvm::TargetRegisterInfo
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Definition: TargetRegisterInfo.h:237

llvm::TypeSize
Definition: TypeSize.h:335

llvm::TypeSize::getFixed
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346

llvm::Type::getInt32Ty
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)

llvm::details::FixedOrScalableQuantity::isKnownMultipleOf
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
Definition: TypeSize.h:184

llvm::details::FixedOrScalableQuantity::divideCoefficientBy
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:255

llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:134

llvm::iterator_range
A range adaptor for a pair of iterators.
Definition: iterator_range.h:42

uint32_t

uint64_t

unsigned

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:164

llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPUAddrSpace.h:38

llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPUAddrSpace.h:32

llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPUAddrSpace.h:34

llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPUAddrSpace.h:35

llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPUAddrSpace.h:36

llvm::AMDGPUAS::BUFFER_RESOURCE
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
Definition: AMDGPUAddrSpace.h:43

llvm::AMDGPU::isFlatGlobalAddrSpace
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPUAddrSpace.h:86

llvm::AMDGPU::isUniformMMO
bool isUniformMMO(const MachineMemOperand *MMO)
Definition: AMDGPUInstrInfo.cpp:30

llvm::AMDGPU::isExtendedGlobalAddrSpace
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPUAddrSpace.h:91

llvm::AMDGPU::getIntrinsicID
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
Definition: AMDGPUInstrInfo.cpp:25

llvm::AMDGPU::getBaseWithConstantOffset
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
Definition: AMDGPUGlobalISelUtils.cpp:26

llvm::AMDGPU::lookupRsrcIntrinsic
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)

llvm::M68k::MemAddrModeKind::L
@ L

llvm::MIPatternMatch::m_Reg
operand_type_match m_Reg()
Definition: MIPatternMatch.h:310

llvm::MIPatternMatch::m_SpecificICstOrSplat
SpecificConstantOrSplatMatch m_SpecificICstOrSplat(APInt RequestedValue)
Matches a RequestedValue constant or a constant splat of RequestedValue.
Definition: MIPatternMatch.h:266

llvm::MIPatternMatch::m_ZeroInt
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
Definition: MIPatternMatch.h:277

llvm::MIPatternMatch::m_ICst
ConstantMatch< APInt > m_ICst(APInt &Cst)
Definition: MIPatternMatch.h:102

llvm::MIPatternMatch::m_GAdd
BinaryOp_match< LHS, RHS, TargetOpcode::G_ADD, true > m_GAdd(const LHS &L, const RHS &R)
Definition: MIPatternMatch.h:550

llvm::MIPatternMatch::mi_match
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
Definition: MIPatternMatch.h:28

llvm::RegState::Kill
@ Kill
The last use of a register.
Definition: MachineInstrBuilder.h:51

llvm::X86::FirstMacroFusionInstKind::Cmp
@ Cmp

llvm::sampleprof::Base
@ Base
Definition: Discriminator.h:58

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18

llvm::Offset
@ Offset
Definition: DWP.cpp:477

llvm::getOpcodeDef
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition: Utils.cpp:651

llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition: MachineInstrBuilder.h:369

llvm::constrainSelectedInstRegOperands
LLVM_ABI bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition: Utils.cpp:155

llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition: iterator_range.h:77

llvm::getIConstantVRegSExtVal
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
Definition: Utils.cpp:314

llvm::MONoClobber
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:44

llvm::reverse
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428

llvm::PackElem::Hi
@ Hi

llvm::PackElem::Lo
@ Lo

llvm::LEB128Sign::Signed
@ Signed

llvm::RecurKind::Add
@ Add
Sum of integers.

llvm::call_once
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition: Threading.h:86

llvm::getIConstantVRegValWithLookThrough
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition: Utils.cpp:433

llvm::assumeAligned
Align assumeAligned(uint64_t Value)
Treats the value 0 as a 1, so Align is always at least 1.
Definition: Alignment.h:111

llvm::Log2
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208

llvm::getSrcRegIgnoringCopies
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition: Utils.cpp:499

llvm::InstructionUniformity::Default
@ Default
The result values are uniform if and only if all operands are uniform.

N
#define N

llvm::AMDGPURegisterBankInfo::OpRegBankEntry
Definition: AMDGPURegisterBankInfo.h:132

llvm::AMDGPU::RsrcIntrinsic
Definition: AMDGPUInstrInfo.h:37

llvm::AMDGPU::RsrcIntrinsic::RsrcArg
uint8_t RsrcArg
Definition: AMDGPUInstrInfo.h:39

llvm::AMDGPU::RsrcIntrinsic::IsImage
bool IsImage
Definition: AMDGPUInstrInfo.h:40

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39

llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85

llvm::MachinePointerInfo
This class contains a discriminated union of information about pointers in memory operands,...
Definition: MachineMemOperand.h:42

llvm::RegisterBankInfo::PartialMapping::StartIdx
unsigned StartIdx
Number of bits at which this partial mapping starts in the original value.
Definition: RegisterBankInfo.h:53

llvm::RegisterBankInfo::PartialMapping::RegBank
const RegisterBank * RegBank
Register bank where the partial value lives.
Definition: RegisterBankInfo.h:61

llvm::RegisterBankInfo::PartialMapping::Length
unsigned Length
Length of this mapping in bits.
Definition: RegisterBankInfo.h:58

llvm::RegisterBankInfo::ValueMapping
Helper struct that represents how a value is mapped through different register banks.
Definition: RegisterBankInfo.h:146

llvm::RegisterBankInfo::ValueMapping::NumBreakDowns
unsigned NumBreakDowns
Number of partial mapping to break down this value.
Definition: RegisterBankInfo.h:151

llvm::RegisterBankInfo::ValueMapping::BreakDown
const PartialMapping * BreakDown
How the value is broken down between the different register banks.
Definition: RegisterBankInfo.h:148

llvm::once_flag
The llvm::once_flag structure.
Definition: Threading.h:67