LLVM: lib/Target/AArch64/AArch64SIMDInstrOpt.cpp Source File

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// This file contains a pass that performs optimization on SIMD instructions

// with high latency by splitting them into more efficient series of

// instructions.

//

// 1. Rewrite certain SIMD instructions that take a vector element operand,
// because they are inefficient on some targets.

//

// For example:

//    fmla v0.4s, v1.4s, v2.s[1]

//

// Is rewritten into:

//    dup v3.4s, v2.s[1]

//    fmla v0.4s, v1.4s, v3.4s

//

// 2. Rewrite interleaved memory access instructions due to their

// inefficiency on some targets.

//

// For example:

//    st2 {v0.4s, v1.4s}, addr

//

// Is rewritten into:

//    zip1 v2.4s, v0.4s, v1.4s

//    zip2 v3.4s, v0.4s, v1.4s

//    stp  q2, q3,  addr

//

//===----------------------------------------------------------------------===//


#include "AArch64InstrInfo.h"

#include "llvm/ADT/SmallVector.h"

#include "llvm/ADT/Statistic.h"

#include "llvm/ADT/StringRef.h"

#include "llvm/CodeGen/MachineBasicBlock.h"

#include "llvm/CodeGen/MachineFunction.h"

#include "llvm/CodeGen/MachineFunctionPass.h"

#include "llvm/CodeGen/MachineInstr.h"

#include "llvm/CodeGen/MachineInstrBuilder.h"

#include "llvm/CodeGen/MachineOperand.h"

#include "llvm/CodeGen/MachineRegisterInfo.h"

#include "llvm/CodeGen/TargetInstrInfo.h"

#include "llvm/CodeGen/TargetSchedule.h"

#include "llvm/CodeGen/TargetSubtargetInfo.h"

#include "llvm/MC/MCInstrDesc.h"

#include "llvm/MC/MCSchedule.h"

#include "llvm/Pass.h"

#include <unordered_map>

#include <map>


using namespace llvm;

// Tag for this pass; the same string is used as the pass argument in the
// INITIALIZE_PASS registration further down in this file.
#define DEBUG_TYPE "aarch64-simdinstr-opt"

// Counter bumped once for every instruction rewritten by either sub-pass
// (see the `++NumModifiedInstr` at the end of optimizeVectElement and
// optimizeLdStInterleave).
STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");

// Human-readable pass name; returned by getPassName() and also handed to
// INITIALIZE_PASS.
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"


namespace {


/// Machine-function pass that rewrites selected AArch64 SIMD instructions
/// into an equivalent sequence of other instructions whenever the scheduling
/// model of the current subtarget reports a lower summed latency for the
/// sequence. Two independent sub-passes exist: vector-by-element arithmetic
/// (VectorElem) and interleaved stores ST2/ST4 (Interleave).
struct AArch64SIMDInstrOpt : public MachineFunctionPass {
  static char ID;

  // Per-function state; all three are (re)initialized at the start of
  // runOnMachineFunction.
  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;

  // The two maps below cache decisions so they are not recomputed.
  // SIMDInstrTable caches the replace / do-not-replace decision, keyed by
  // (original opcode, CPU name); since the CPU name is part of the key the
  // cache stays valid within and across functions.
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
  // InterlEarlyExit caches, per CPU name, whether the interleaved-store
  // sub-pass should be skipped altogether for that target.
  std::unordered_map<std::string, bool> InterlEarlyExit;

  // Selects which of the two rewrites a query or a walk refers to.
  typedef enum {
    VectorElem,
    Interleave
  } Subpass;

  // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
  // RC is the register class used for the virtual registers that receive the
  // results of the non-store replacement instructions.
  // NOTE(review): RC is held by value, so every table entry carries its own
  // copy of the register class object and `&RC` (used when creating virtual
  // registers) points at that copy rather than at AArch64::FPR*RegClass --
  // confirm this is intended.
  struct InstReplInfo {
    unsigned OrigOpc;
    std::vector<unsigned> ReplOpc;
    const TargetRegisterClass RC;
  };

// Builds an InstReplInfo for an ST2 opcode: three replacement opcodes.
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
// Builds an InstReplInfo for an ST4 opcode: ten replacement opcodes.
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}

  // The Instruction Replacement Table: one rule per supported ST2/ST4 opcode.
  // In every rule the ZIP opcodes come first and the STP opcode(s) last.
  std::vector<InstReplInfo> IRT = {
    // ST2 instructions
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::FPR64RegClass),
    // ST4 instructions
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
          AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
          AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
          AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
          AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
          AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
          AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
          AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };

  // A costly instruction is replaced in this work by N efficient instructions.
  // The maximum of N is currently 10 and it is for the ST4 case (eight ZIPs
  // plus two STPs, see RuleST4). Used as the inline capacity of the small
  // vectors that hold replacement descriptors and registers.
  static const unsigned MaxNumRepl = 10;

  AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {}

  /// Based only on latency of instructions, determine if it is cost efficient
  /// to replace the instruction InstDesc by the instructions whose descriptors
  /// are stored in ReplInstrMCID. The answer is cached in SIMDInstrTable.
  /// MF is not consulted by the current implementation.
  /// Return true if replacement is expected to be faster.
  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);

  /// Determine if we need to exit the instruction replacement optimization
  /// passes early. This makes sure that no compile time is spent in this pass
  /// for targets with no need for any of these optimizations.
  /// Return true if early exit of the pass is recommended.
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);

  /// Check whether an equivalent DUP instruction has already been
  /// created or not.
  /// Return true when the DUP instruction already exists. In this case,
  /// *DestReg is set to the destination of the already created DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  /// Certain SIMD instructions with vector element operand are not efficient.
  /// Rewrite them into SIMD instructions with vector operands. This rewrite
  /// is driven by the latency of the instructions.
  /// Return true if the SIMD instruction is modified.
  bool optimizeVectElement(MachineInstr &MI);

  /// Process the REG_SEQUENCE instruction, and extract the source
  /// operands of the ST2/4 instruction from it.
  /// Example of such an instruction:
  ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
  /// StReg and StRegKill receive NumArg registers and their kill states.
  /// Return true when the instruction is processed successfully.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
                         unsigned* StRegKill, unsigned NumArg) const;

  /// Load/Store Interleaving instructions are not always beneficial.
  /// Replace them by ZIP instructions and classical load/store.
  /// Return true if the SIMD instruction is modified.
  bool optimizeLdStInterleave(MachineInstr &MI);

  /// Return the number of useful source registers for this
  /// instruction (2 for ST2 and 4 for ST4).
  unsigned determineSrcReg(MachineInstr &MI) const;

  /// Pass entry point: runs both sub-passes over the function and returns
  /// true when at least one instruction was rewritten.
  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};


// Definition of the pass identifier that the constructor passes to the
// MachineFunctionPass base class.
char AArch64SIMDInstrOpt::ID = 0;


} // end anonymous namespace


// Register the pass under the argument "aarch64-simdinstr-opt" with the
// descriptive name above; the two trailing `false` values are the macro's
// `cfg` and `analysis` parameters.
INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)


/// Decide, from instruction latencies alone, whether the instruction described
/// by InstDesc should be replaced by the sequence described by InstDescRepl.
///
/// The decision is memoized in SIMDInstrTable under the key
/// (opcode of InstDesc, CPU name of the current subtarget), so the scheduling
/// model is consulted at most once per key. The answer is "do not replace"
/// whenever the original or any replacement instruction has a scheduling class
/// that is invalid or variant. Otherwise the sequence wins only when the sum
/// of its latencies is strictly smaller than the latency of the original.
///
/// \param MF            Not used by this implementation.
/// \param InstDesc      Descriptor of the instruction to be replaced.
/// \param InstDescRepl  Descriptors of the replacement instructions.
/// \return true if the replacement is expected to be faster.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
  // Serve the answer from the cache when this (opcode, CPU) pair was seen.
  const auto CacheKey = std::make_pair(
      InstDesc->getOpcode(),
      std::string(SchedModel.getSubtargetInfo()->getCPU()));
  const auto Cached = SIMDInstrTable.find(CacheKey);
  if (Cached != SIMDInstrTable.end())
    return Cached->second;

  // Stores a fresh decision in the cache and hands it back to the caller.
  auto Record = [&](bool Decision) {
    SIMDInstrTable[CacheKey] = Decision;
    return Decision;
  };

  // An instruction is unusable for the comparison when the target defines no
  // valid scheduling class for it, or when that class is variant.
  const MCSchedModel *Model = SchedModel.getMCSchedModel();
  auto LacksFixedSchedClass = [Model](const MCInstrDesc *Desc) {
    const MCSchedClassDesc *SC =
        Model->getSchedClassDesc(Desc->getSchedClass());
    return !SC->isValid() || SC->isVariant();
  };

  if (LacksFixedSchedClass(InstDesc))
    return Record(false);
  for (const MCInstrDesc *Desc : InstDescRepl)
    if (LacksFixedSchedClass(Desc))
      return Record(false);

  // Cost of the replacement: the plain sum of the individual latencies.
  unsigned ReplCost = 0;
  for (const MCInstrDesc *Desc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(Desc->getOpcode());

  const unsigned OrigCost =
      SchedModel.computeInstrLatency(InstDesc->getOpcode());
  return Record(OrigCost > ReplCost);
}


/// Tell whether one kind of instruction replacement can be skipped entirely
/// for the current target, so that no compile time is spent walking the
/// function when nothing can be gained.
///
/// * VectorElem: a single representative rewrite
///   (FMLAv4i32_indexed -> DUPv4i32lane + FMLAv4f32) is evaluated.
///   TODO: check for all concerned instructions.
/// * Interleave: every rule of the replacement table is evaluated; the
///   outcome is cached per CPU name in InterlEarlyExit.
///
/// \return true if early exit is recommended, i.e. no evaluated rewrite is
/// profitable.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  SmallVector<const MCInstrDesc*, MaxNumRepl> Candidates;

  if (SP == VectorElem) {
    Candidates.push_back(&TII->get(AArch64::DUPv4i32lane));
    Candidates.push_back(&TII->get(AArch64::FMLAv4f32));
    const MCInstrDesc *Representative = &TII->get(AArch64::FMLAv4i32_indexed);
    return !shouldReplaceInst(MF, Representative, Candidates);
  }

  if (SP != Interleave)
    return true;

  // The interleave verdict only depends on the CPU, so look it up first.
  const std::string CPUName =
      std::string(SchedModel.getSubtargetInfo()->getCPU());
  const auto Known = InterlEarlyExit.find(CPUName);
  if (Known != InterlEarlyExit.end())
    return Known->second;

  // One profitable rule is enough to keep the sub-pass enabled.
  for (const InstReplInfo &Rule : IRT) {
    Candidates.clear();
    for (unsigned Opc : Rule.ReplOpc)
      Candidates.push_back(&TII->get(Opc));
    if (shouldReplaceInst(MF, &TII->get(Rule.OrigOpc), Candidates)) {
      InterlEarlyExit[CPUName] = false;
      return false;
    }
  }

  InterlEarlyExit[CPUName] = true;
  return true;
}


/// Search backwards from MI, up to the start of its basic block, for an
/// instruction with opcode DupOpcode that has exactly three operands, reads
/// SrcReg as operand 1 and carries LaneNumber as operand 2.
///
/// \param DestReg  On success, receives operand 0 (the destination) of the
///                 closest matching instruction; left untouched otherwise.
/// \return true when such an instruction exists.
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                         unsigned SrcReg, unsigned LaneNumber,
                                         unsigned *DestReg) const {
  const MachineBasicBlock::iterator BlockStart = MI.getParent()->begin();
  MachineBasicBlock::iterator Cursor = MI;

  while (Cursor != BlockStart) {
    // Step first: MI itself is never a candidate.
    Cursor--;
    MachineInstr *Candidate = &*Cursor;

    const bool IsSameDup =
        Candidate->getOpcode() == DupOpcode &&
        Candidate->getNumOperands() == 3 &&
        Candidate->getOperand(1).getReg() == SrcReg &&
        Candidate->getOperand(2).getImm() == LaneNumber;
    if (!IsSameDup)
      continue;

    *DestReg = Candidate->getOperand(0).getReg();
    return true;
  }

  return false;
}


/// Certain SIMD instructions with vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instructions of concern are for the time being FMLA, FMLS, FMUL,
/// and FMULX and hence they are hardcoded.
///
/// For example:
///    fmla v0.4s, v1.4s, v2.s[1]
///
/// Is rewritten into
///    dup  v3.4s, v2.s[1]      // DUP not necessary if redundant
///    fmla v0.4s, v1.4s, v3.4s
///
/// The new instructions are inserted in front of MI; MI itself is left in
/// place and must be erased by the caller when true is returned.
///
/// Kill-flag handling: the kill state of the lane-vector operand is only
/// forwarded where that is known to be safe. A DUP that is being reused may
/// have other readers after MI, so its result is never marked killed and any
/// kill flags it already carries are cleared. A new DUP does not kill the
/// lane vector when the replacement instruction reads that same register
/// again as a full-vector source.
///
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID = nullptr;
  const MCInstrDesc *DupMCID = nullptr;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }

  SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // Instructions of interest have either 4 or 5 operands:
  //   4: (dest, src0, lane-vector, lane)
  //   5: (dest, src0, src1, lane-vector, lane)
  const unsigned NumOps = MI.getNumOperands();
  if (NumOps != 4 && NumOps != 5)
    return false;
  const bool IsFiveOperandForm = NumOps == 5;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  // Named RegInfo so that it does not shadow the MRI member.
  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();

  // Get the operands of the current SIMD arithmetic instruction.
  Register MulDest = MI.getOperand(0).getReg();
  Register SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());

  const MachineOperand &LaneVecOp = MI.getOperand(NumOps - 2);
  Register LaneVecReg = LaneVecOp.getReg();
  unsigned LaneVecIsKill = getKillRegState(LaneVecOp.isKill());
  unsigned LaneNumber = MI.getOperand(NumOps - 1).getImm();

  // The replacement instruction is placed after the DUP and still reads the
  // full-vector sources. If one of them is the lane vector itself, the DUP
  // must not be the instruction that kills it.
  bool LaneVecReadAgain = LaneVecReg == SrcReg0;
  if (IsFiveOperandForm)
    LaneVecReadAgain |= LaneVecReg == MI.getOperand(2).getReg();

  // Create a new DUP instruction. Note that if an equivalent DUP instruction
  // has already been created before, then use that one instead of creating
  // a new one.
  unsigned DupDest;
  unsigned DupDestIsKill = 0;
  if (reuseDUP(MI, DupMCID->getOpcode(), LaneVecReg, LaneNumber, &DupDest)) {
    // The reused value now has one more reader, and it may have further
    // readers after MI. Dropping kill flags is always conservative, so clear
    // the ones already present and do not add a new one.
    RegInfo.clearKillFlags(DupDest);
  } else {
    DupDest = RegInfo.createVirtualRegister(RC);
    BuildMI(MBB, MI, DL, *DupMCID, DupDest)
        .addReg(LaneVecReg, LaneVecReadAgain ? 0u : LaneVecIsKill)
        .addImm(LaneNumber);
    DupDestIsKill = LaneVecIsKill;
  }

  // Emit the vector-operand form: same sources, with the DUP result taking
  // the place of the (lane-vector, lane) pair.
  MachineInstrBuilder Mul = BuildMI(MBB, MI, DL, *MulMCID, MulDest);
  Mul.addReg(SrcReg0, Src0IsKill);
  if (IsFiveOperandForm) {
    const MachineOperand &Src1Op = MI.getOperand(2);
    Mul.addReg(Src1Op.getReg(), getKillRegState(Src1Op.isKill()));
  }
  Mul.addReg(DupDest, DupDestIsKill);

  ++NumModifiedInstr;
  return true;
}


/// Load/Store Interleaving instructions are not always beneficial.
/// Replace them by ZIP instructions and classical load/store.
///
/// For example:
///    st2 {v0.4s, v1.4s}, addr
///
/// Is rewritten into:
///    zip1 v2.4s, v0.4s, v1.4s
///    zip2 v3.4s, v0.4s, v1.4s
///    stp  q2, q3, addr
///
/// For example:
///    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
///
/// Is rewritten into:
///    zip1 v4.4s, v0.4s, v2.4s
///    zip2 v5.4s, v0.4s, v2.4s
///    zip1 v6.4s, v1.4s, v3.4s
///    zip2 v7.4s, v1.4s, v3.4s
///    zip1 v8.4s, v4.4s, v6.4s
///    zip2 v9.4s, v4.4s, v6.4s
///    zip1 v10.4s, v5.4s, v7.4s
///    zip2 v11.4s, v5.4s, v7.4s
///    stp  q8, q9, addr
///    stp  q10, q11, addr+32
///
/// Currently only instructions related to ST2 and ST4 are considered.
/// Other may be added later.
///
/// The new instructions are inserted in front of MI; MI itself is left in
/// place and must be erased by the caller when true is returned. No virtual
/// register is created unless the rewrite is actually performed.
///
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
  // Find the rewriting rule for this opcode, if there is one.
  const InstReplInfo *Rule = nullptr;
  for (const InstReplInfo &I : IRT) {
    if (I.OrigOpc == MI.getOpcode()) {
      Rule = &I;
      break;
    }
  }
  if (!Rule)
    return false;

  // Operand 0 is the register tuple being stored, operand 1 the address.
  unsigned SeqReg = MI.getOperand(0).getReg();
  unsigned AddrReg = MI.getOperand(1).getReg();

  // The individual vectors are recovered from the instruction defining the
  // tuple. Without a unique definition there is nothing to analyse.
  MachineInstr *DefiningMI = MRI->getUniqueVRegDef(SeqReg);
  if (!DefiningMI)
    return false;

  unsigned StReg[4], StRegKill[4];
  if (!processSeqRegInst(DefiningMI, StReg, StRegKill, determineSrcReg(MI)))
    return false;

  SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
  for (unsigned Repl : Rule->ReplOpc)
    ReplInstrMCID.push_back(&TII->get(Repl));

  // Determine if it is profitable to replace MI by the series of instructions
  // represented in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // Only now that the rewrite is going to happen, generate destination
  // registers -- one per non-store replacement instruction. Doing it earlier
  // would leave unused virtual registers behind on every rejected candidate.
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  for (unsigned Repl : Rule->ReplOpc)
    if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
      ZipDest.push_back(MRI->createVirtualRegister(&Rule->RC));

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();

  // Emits replacement number Idx as a two-source instruction writing
  // ZipDest[Idx]. In every rule the ZIPs precede the STPs, so the same index
  // addresses both ReplInstrMCID and ZipDest.
  auto EmitZip = [&](unsigned Idx, unsigned LHS, unsigned LHSFlags,
                     unsigned RHS, unsigned RHSFlags) {
    BuildMI(MBB, MI, DL, *ReplInstrMCID[Idx], ZipDest[Idx])
        .addReg(LHS, LHSFlags)
        .addReg(RHS, RHSFlags);
  };
  // Emits replacement number Idx as a pair store of First/Second at
  // AddrReg plus the immediate Offset.
  auto EmitStp = [&](unsigned Idx, unsigned First, unsigned Second,
                     int Offset) {
    BuildMI(MBB, MI, DL, *ReplInstrMCID[Idx])
        .addReg(First)
        .addReg(Second)
        .addReg(AddrReg)
        .addImm(Offset);
  };

  // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
  // this point, the code generation is hardcoded and does not rely on the IRT
  // table used above given that code generation for ST2 replacement is somewhat
  // different than for ST4 replacement. We could have added more info into the
  // table related to how we build new instructions but we may be adding more
  // complexity with that).
  //
  // The kill state recorded for each source is attached to its second (last)
  // use among the new instructions.
  // NOTE(review): these kill states come from the REG_SEQUENCE operands, and
  // the REG_SEQUENCE itself is not modified here -- confirm that forwarding
  // them to the ZIPs is valid.
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    // ZIP instructions
    EmitZip(0, StReg[0], 0, StReg[1], 0);
    EmitZip(1, StReg[0], StRegKill[0], StReg[1], StRegKill[1]);
    // STP instructions
    EmitStp(2, ZipDest[0], ZipDest[1], 0);
    break;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // ZIP instructions, first level: combine sources 0/2 and 1/3.
    EmitZip(0, StReg[0], 0, StReg[2], 0);
    EmitZip(1, StReg[0], StRegKill[0], StReg[2], StRegKill[2]);
    EmitZip(2, StReg[1], 0, StReg[3], 0);
    EmitZip(3, StReg[1], StRegKill[1], StReg[3], StRegKill[3]);
    // ZIP instructions, second level: combine the first-level results.
    EmitZip(4, ZipDest[0], 0, ZipDest[2], 0);
    EmitZip(5, ZipDest[0], 0, ZipDest[2], 0);
    EmitZip(6, ZipDest[1], 0, ZipDest[3], 0);
    EmitZip(7, ZipDest[1], 0, ZipDest[3], 0);
    // STP instructions; the second pair uses immediate 2, which is the
    // `addr+32` store of the q-register example above.
    EmitStp(8, ZipDest[4], ZipDest[5], 0);
    EmitStp(9, ZipDest[6], ZipDest[7], 2);
    break;
  }

  ++NumModifiedInstr;
  return true;
}


/// Process the REG_SEQUENCE instruction, and extract the source
/// operands of the ST2/4 instruction from it.
/// Example of such an instruction:
///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
///
/// Operand 0 of a REG_SEQUENCE is the result; it is followed by
/// (register, sub-register index) pairs. Each of the first NumArg pairs is
/// stored at the position named by its sub-register index (dsub0/qsub0 -> 0,
/// dsub1/qsub1 -> 1, ...), so the output does not depend on the order in
/// which the pairs are listed.
///
/// \param DefiningMI  Instruction defining the stored register tuple; a null
///                    pointer is treated as "cannot be processed".
/// \param StReg       Receives NumArg source registers.
/// \param StRegKill   Receives the kill state of each source register.
/// \param NumArg      Number of sources expected (2 for ST2, 4 for ST4).
/// \return true when the instruction is processed successfully. On false
/// the output arrays may be partially written and must not be used.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
  // Checked at run time rather than with an assert, so builds without
  // assertions do not dereference a null pointer.
  if (!DefiningMI || DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  // One result operand plus NumArg (register, index) pairs are required.
  if (DefiningMI->getNumOperands() < 2 * NumArg + 1)
    return false;

  unsigned FilledSlots = 0; // Bit i set <=> StReg[i] has been written.
  for (unsigned i = 0; i < NumArg; ++i) {
    const MachineOperand &RegOp = DefiningMI->getOperand(2 * i + 1);
    const MachineOperand &IdxOp = DefiningMI->getOperand(2 * i + 2);

    // Validation check for the pair. A source that itself carries a
    // sub-register index does not denote the whole register, so it cannot be
    // handed on as a plain register.
    if (!RegOp.isReg() || RegOp.getSubReg() != 0 || !IdxOp.isImm())
      return false;

    unsigned Slot;
    switch (IdxOp.getImm()) {
    default:
      return false;

    case AArch64::dsub0:
    case AArch64::qsub0:
      Slot = 0;
      break;
    case AArch64::dsub1:
    case AArch64::qsub1:
      Slot = 1;
      break;
    case AArch64::dsub2:
    case AArch64::qsub2:
      Slot = 2;
      break;
    case AArch64::dsub3:
    case AArch64::qsub3:
      Slot = 3;
      break;
    }

    // Every position must be named exactly once and lie within the tuple.
    if (Slot >= NumArg || (FilledSlots & (1u << Slot)))
      return false;
    FilledSlots |= 1u << Slot;

    StReg[Slot]     = RegOp.getReg();
    StRegKill[Slot] = getKillRegState(RegOp.isKill());
  }
  return true;
}


/// Number of vector registers stored by the interleaved store MI:
/// 4 for the ST4 opcodes and 2 for the ST2 opcodes handled by this pass.
/// Must only be called with one of those opcodes; anything else is treated
/// as unreachable.
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  const unsigned Opcode = MI.getOpcode();

  switch (Opcode) {
  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  default:
    break;
  }

  llvm_unreachable("Unsupported instruction for this pass");
}


/// Pass entry point. Initializes the per-function state (TII, MRI and the
/// scheduling model) and then runs the two sub-passes one after the other:
/// first the vector-element rewrite, then the interleaved-store rewrite.
/// A sub-pass is skipped when shouldExitEarly() recommends so. Replaced
/// instructions are collected during the walk and erased once the walk of
/// that sub-pass is complete.
///
/// \return true if at least one instruction was rewritten.
bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const TargetSubtargetInfo &ST = MF.getSubtarget();
  TII = ST.getInstrInfo();
  MRI = &MF.getRegInfo();
  // Nothing can be done without instruction information.
  if (!TII)
    return false;

  // All profitability decisions are based on the scheduling model.
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
  for (Subpass Kind : {VectorElem, Interleave}) {
    if (shouldExitEarly(&MF, Kind))
      continue;

    // Originals that have been replaced; they stay in the block during the
    // walk and are deleted afterwards.
    SmallVector<MachineInstr *, 8> Replaced;
    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        const bool Rewritten = (Kind == VectorElem)
                                   ? optimizeVectElement(MI)
                                   : optimizeLdStInterleave(MI);
        if (Rewritten)
          Replaced.push_back(&MI);
      }
    }

    Changed |= !Replaced.empty();
    for (MachineInstr *Dead : Replaced)
      Dead->eraseFromParent();
  }

  return Changed;
}


/// Factory for the high cost ASIMD instruction replacement optimization pass.
/// Returns a newly allocated AArch64SIMDInstrOpt.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  FunctionPass *NewPass = new AArch64SIMDInstrOpt();
  return NewPass;
}

MRI
unsigned const MachineRegisterInfo * MRI
Definition: AArch64AdvSIMDScalarPass.cpp:103

AArch64InstrInfo.h

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

const
aarch64 promote const
Definition: AArch64PromoteConstant.cpp:228

RuleST4
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9, RC)
Definition: AArch64SIMDInstrOpt.cpp:96

RuleST2
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC)
Definition: AArch64SIMDInstrOpt.cpp:94

AARCH64_VECTOR_BY_ELEMENT_OPT_NAME
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME
Definition: AArch64SIMDInstrOpt.cpp:62

MBB
MachineBasicBlock & MBB
Definition: ARMSLSHardening.cpp:71

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: ARMSLSHardening.cpp:73

TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:118

MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:110

MCInstrDesc.h

MCSchedule.h

I
#define I(x, y, z)
Definition: MD5.cpp:58

MachineBasicBlock.h

MachineFunctionPass.h

MachineFunction.h

MachineInstrBuilder.h

MachineInstr.h

MachineOperand.h

MachineRegisterInfo.h

INITIALIZE_PASS
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:56

Pass.h

SmallVector.h
This file defines the SmallVector class.

Statistic.h
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...

STATISTIC
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167

StringRef.h

TargetInstrInfo.h

TargetSchedule.h

TargetSubtargetInfo.h

bool

llvm::AArch64InstrInfo
Definition: AArch64InstrInfo.h:180

llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:124

llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:314

llvm::MCInstrDesc
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:199

llvm::MCInstrDesc::getOpcode
unsigned getOpcode() const
Return the opcode number for this descriptor.
Definition: MCInstrDesc.h:231

llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:122

llvm::MachineBasicBlock::getParent
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Definition: MachineBasicBlock.h:323

llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:31

llvm::MachineFunctionPass::runOnMachineFunction
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...

llvm::MachineFunction
Definition: MachineFunction.h:286

llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:762

llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:772

llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition: MachineFunction.h:733

llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition: MachineInstrBuilder.h:160

llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition: MachineInstrBuilder.h:126

llvm::MachineInstrBundleIterator< MachineInstr >

llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:72

llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:587

llvm::MachineInstr::getNumOperands
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:590

llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595

llvm::MachineOperand::getImm
int64_t getImm() const
Definition: MachineOperand.h:556

llvm::MachineOperand::isKill
bool isKill() const
Definition: MachineOperand.h:398

llvm::MachineOperand::isImm
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Definition: MachineOperand.h:330

llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition: MachineOperand.h:368

llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:53

llvm::Pass::getPassName
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:85

llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574

llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:611

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition: SmallVector.h:414

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197

llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55

llvm::TargetInstrInfo
TargetInstrInfo - Interface to description of machine instruction set.
Definition: TargetInstrInfo.h:114

llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:45

llvm::TargetSchedModel
Provide an instruction scheduling machine model to CodeGen passes.
Definition: TargetSchedule.h:31

llvm::TargetSubtargetInfo
TargetSubtargetInfo - Generic base class for all target subtargets.
Definition: TargetSubtargetInfo.h:65

llvm::TargetSubtargetInfo::getInstrInfo
virtual const TargetInstrInfo * getInstrInfo() const
Definition: TargetSubtargetInfo.h:99

unsigned

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:164

llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18

llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition: MachineInstrBuilder.h:369

llvm::createAArch64SIMDInstrOptPass
FunctionPass * createAArch64SIMDInstrOptPass()
Returns an instance of the high cost ASIMD instruction replacement optimization pass.
Definition: AArch64SIMDInstrOpt.cpp:737

llvm::getKillRegState
unsigned getKillRegState(bool B)
Definition: MachineInstrBuilder.h:543

llvm::MCSchedClassDesc
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:123

llvm::MCSchedClassDesc::isValid
bool isValid() const
Definition: MCSchedule.h:141

llvm::MCSchedClassDesc::isVariant
bool isVariant() const
Definition: MCSchedule.h:144