#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "aarch64-simdinstr-opt"
60 "Number of SIMD instructions modified");
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
  std::unordered_map<std::string, bool> InterlEarlyExit;

  typedef enum { VectorElem, Interleave } Subpass;

  // A rewriting rule: the original opcode, the opcodes that replace it, and
  // the register class used for the new virtual registers.
  struct InstReplInfo {
    unsigned OrigOpc;
    std::vector<unsigned> ReplOpc;
    const TargetRegisterClass RC;
  };
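// Helper macros for building the InstReplInfo rewriting rules below: RuleST2
// describes an ST2 replaced by two ZIPs and one store-pair instruction,
// RuleST4 an ST4 replaced by eight ZIPs and two store-pair instructions.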
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
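  // The Instruction Replacement Table. Each entry maps an interleaved-store
  // opcode to the ZIP1/ZIP2 and STP opcodes that replace it, together with
  // the register class used for the ZIP destination registers.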
  std::vector<InstReplInfo> IRT = {
    // ST2 instructions
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::FPR64RegClass),
    // ST4 instructions
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };
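  // A single replaced instruction expands to at most this many replacement
  // instructions (10: the eight ZIPs plus the two STPs of an ST4 rule).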
  static const unsigned MaxNumRepl = 10;
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
                         unsigned *StRegKill, unsigned NumArg) const;
char AArch64SIMDInstrOpt::ID = 0;
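// Return true if replacing the instruction described by InstDesc with the
// sequence described by InstDescRepl is expected to be profitable on this
// subtarget, i.e. the summed latency of the replacement instructions is
// lower than the latency of the original per the MCSchedModel. The decision
// is cached in SIMDInstrTable, keyed by (opcode, CPU).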
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  // Check whether the replacement decision is already cached for this
  // instruction and subtarget.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;
  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
  // If the target does not provide a valid, non-variant scheduling class for
  // the original instruction or for any of the replacement instructions, do
  // not replace.
  const MCSchedClassDesc *SCDescRepl;
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (const auto *IDesc : InstDescRepl) {
    SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
        IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }

  // Replacement cost: the summed latency of the replacement instructions,
  // compared against the latency of the original instruction.
  unsigned ReplCost = 0;
  for (const auto *IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  }
  SIMDInstrTable[InstID] = false;
  return false;
}
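// Return true if the given subpass cannot be profitable on this target, so
// that the pass can skip scanning the function. For VectorElem a single
// representative FMLA-by-element rewrite is checked; for Interleave every
// rule in IRT is checked and the per-CPU answer is cached in InterlEarlyExit.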
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc *OriginalMCID;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // For the vector-by-element subpass, check profitability on one
  // representative instruction only.
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;

  // For the interleaved-store subpass, check every rule in the table and
  // cache the per-CPU outcome.
  case Interleave:
    std::string Subtarget =
        std::string(SchedModel.getSubtargetInfo()->getCPU());
    auto It = InterlEarlyExit.find(Subtarget);
    if (It != InterlEarlyExit.end())
      return It->second;

    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }

  return true;
}
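// Look backwards in the basic block for an existing DUP of (SrcReg,
// LaneNumber) with opcode DupOpcode. If one is found, return true and set
// *DestReg to its destination register so the caller can reuse it instead of
// emitting a redundant DUP.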
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }
  return false;
}
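// Rewrite a floating-point multiply/accumulate "by element" instruction into
// an explicit DUP of the lane followed by the plain vector form, when the
// scheduling model says the pair is cheaper. Illustrative example for the
// FMLAv4i32_indexed case handled below:
//   fmla v0.4s, v1.4s, v2.s[1]
// becomes
//   dup  v3.4s, v2.s[1]
//   fmla v0.4s, v1.4s, v3.4s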
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;
  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }
  // Determine whether the rewrite is profitable before generating any code.
  SmallVector<const MCInstrDesc *, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  // Get the operands of the current SIMD arithmetic instruction.
  Register MulDest = MI.getOperand(0).getReg();
  Register SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  Register SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands.
  if (MI.getNumOperands() == 5) {
    Register SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction unless an equivalent one created earlier
    // in the block can be reused.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, MI.getDebugLoc(), *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, MI.getDebugLoc(), *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, MI.getDebugLoc(), *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, MI.getDebugLoc(), *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  ++NumModifiedInstr;
  return true;
}
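// Rewrite an interleaved store (ST2/ST4) into ZIP instructions followed by
// paired stores, per the IRT table, when that is cheaper on the target.
// Illustrative example for the 128-bit ST2 case:
//   st2 {v0.4s, v1.4s}, [addr]
// becomes
//   zip1 v2.4s, v0.4s, v1.4s
//   zip2 v3.4s, v0.4s, v1.4s
//   stp  q2, q3, [addr]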
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {

  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  MachineInstr *DefiningMI;
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;
  // If the current instruction matches one of the rewriting rules, gather
  // the parameters of the replacement instructions.
  bool Match = false;
  for (auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg  = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;

      // Collect the replacement MCInstrDescs and create destination registers
      // for the ZIP instructions (the stores do not need one).
      for (auto &Repl : I.ReplOpc) {
        ReplInstrMCID.push_back(&TII->get(Repl));
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
      }
      Match = true;
      break;
    }
  }

  if (!Match)
    return false;

  // Determine if it is profitable to replace MI by the series of instructions
  // represented in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;
  // Generate the replacement instructions: ZIPs of the source registers
  // followed by STP stores of the zipped results.
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected instruction opcode.");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    // STP instruction
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // First-level ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    // The second-level ZIPs (ReplInstrMCID[4..7]) combine the ZipDest results
    // pairwise in the same way, and the two STPs (ReplInstrMCID[8..9]) store
    // them at AddrReg to complete the ST4 lowering.
    break;
  }

  ++NumModifiedInstr;
  return true;
}
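// Extract the source registers feeding the interleaved store from the
// REG_SEQUENCE instruction that defines its register tuple, filling StReg and
// StRegKill for NumArg registers. Return false if DefiningMI is not a
// REG_SEQUENCE.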
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned *StReg, unsigned *StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  // The REG_SEQUENCE operands come in (register, sub-register index) pairs;
  // collect each source register and its kill state.
  for (unsigned i = 0; i < NumArg; i++) {
    StReg[i]     = DefiningMI->getOperand(2 * i + 1).getReg();
    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2 * i + 1).isKill());
  }

  return true;
}
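// Return the number of source registers the interleaved store consumes:
// two for the ST2 variants and four for the ST4 variants.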
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction for this pass");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}
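// Pass driver: run the vector-by-element and interleaved-store subpasses in
// turn over every instruction of the function, unless shouldExitEarly has
// already established that a subpass cannot pay off on this subtarget.
// Replaced instructions are collected and erased after the walk.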
bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  TII = MF.getSubtarget().getInstrInfo();
  MRI = &MF.getRegInfo();
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF)
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }
  return Changed;
}

FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}