#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {
class SIPreEmitPeephole {
  // ... (fields and helper declarations elided)
public:
  bool run(MachineFunction &MF);
};

class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
  // ...
  bool runOnMachineFunction(MachineFunction &MF) override {
    return SIPreEmitPeephole().run(MF);
  }
};
} // end anonymous namespace

INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeepholeLegacy::ID = 0;
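// optimizeVccBranch rewrites a VCC-conditional branch whose condition is
// really just a copy of EXEC. Schematically (wave64 shown):
//   sreg = -1 or 0
//   vcc  = S_AND_B64 exec, sreg   (or S_ANDN2_B64 exec, sreg)
//   S_CBRANCH_VCC[N]Z
// =>
//   S_CBRANCH_EXEC[N]Z
// The pattern shows up after basic-block placement merges a block that
// saves a -1/0 mask with a block that ANDs that mask into VCC and branches
// on it.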
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  // Walk backwards from the branch, looking for the AND that defines VCC.
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
    if (--Threshold == 0)
      return false;
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    // Canonicalize EXEC into the first operand.
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is already redundant: erase the AND and return.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If SReg is used only by the AND, fold the immediate into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert the mask for S_ANDN2.
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;
  // If the condition is not read again and SCC from the AND is dead, replace
  // the AND with a MOV (or delete it outright).
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    A->eraseFromParent();
  }
  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // The condition is EXEC itself: a VCCZ branch is only taken when no
    // lanes are active, so it can simply be deleted; VCCNZ becomes an
    // unconditional branch.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // The branch is always taken: remove every branch (and successor edge)
    // now shadowed by the new unconditional branch.
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (MachineInstr *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }
    if (MachineBasicBlock *Succ = Parent->getFallThrough())
      Parent->removeSuccessor(Succ);
    // Rewrite to an unconditional branch.
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // The branch is never taken: delete it together with its successor edge.
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // The condition now depends only on EXEC.
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, /*Kill=*/false));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
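// optimizeSetGPR removes a second S_SET_GPR_IDX_ON that is identical to an
// earlier one, provided nothing in between changes M0, the index register,
// or any vector register touched while index mode is active.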
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan back to the earlier, identical S_SET_GPR_IDX_ON.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
            return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
          })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}
bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}
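// BranchWeightCostModel decides whether it is cheaper to always fall through
// into the "then" block than to keep an s_cbranch_execz guarding it. Branch
// and per-instruction latencies come from the scheduling model; the
// probability of skipping the block comes from the block's branch weights.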
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }
  bool isProfitable(const MachineInstr &MI) {
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
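    // Let P = N/D be the probability that execz is false, i.e. that the
    // "then" block actually runs. Unconditional fall-through is profitable
    // when:
    //   ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    //   (1-P)*ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // Substituting N/D for P and clearing the denominator gives the integer
    // comparison below.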
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};
bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  assert(is_contained(Branch.getParent()->successors(), &From));
  BranchWeightCostModel CostModel{*TII, Branch, From};

  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End(MF->end());
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop sits inside non-uniform control flow, the branch
      // leaving the loop might never be taken while EXEC = 0; retain any
      // conditional branch lest the loop become infinite.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      if (!CostModel.isProfitable(MI))
        return true;
    }
  }

  return false;
}
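// removeExeczBranch drops an s_cbranch_execz over a forward region when the
// cost model says predicated fall-through is cheaper than keeping the branch.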
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only forward branches.
  if (SrcMBB.getNumber() >= TrueMBB->getNumber())
    return false;

  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}
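// The remaining helpers split packed FP32 ops (v_pk_*_f32) that sit in the
// shadow of a long-latency MFMA into two unpacked VALU ops, so they can
// co-issue with the MFMA instead of stalling behind it.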
bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
  if (!TII->isNeverCoissue(MI))
    return false;
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_PK_ADD_F32:
  case AMDGPU::V_PK_MUL_F32:
  case AMDGPU::V_PK_FMA_F32:
    return true;
  default:
    return false;
  }
}
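// canUnpackingClobberRegister checks whether the first (low) unpacked
// instruction would overwrite a source subregister that the second (high)
// unpacked instruction still needs to read.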
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  // Only the low half of the destination needs checking: the unpacked
  // instruction writing sub0 is emitted first.
  Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);

  const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (Src0MO && Src0MO->isReg()) {
    Register SrcReg0 = Src0MO->getReg();
    unsigned Src0Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
      return true;
  }

  const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1MO && Src1MO->isReg()) {
    Register SrcReg1 = Src1MO->getReg();
    unsigned Src1Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
    Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
      return true;
  }

  // FMA also reads src2.
  if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::src2)) {
    const MachineOperand *Src2MO =
        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    if (Src2MO && Src2MO->isReg()) {
      Register SrcReg2 = Src2MO->getReg();
      unsigned Src2Mods =
          TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
      Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
                               ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
                               : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
      if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
        return true;
    }
  }
  return false;
}
uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
  unsigned Opcode = I.getOpcode();
  // Use the 64-bit encoding so VOP3 modifiers remain available.
  switch (Opcode) {
  case AMDGPU::V_PK_ADD_F32:
    return AMDGPU::V_ADD_F32_e64;
  case AMDGPU::V_PK_MUL_F32:
    return AMDGPU::V_MUL_F32_e64;
  case AMDGPU::V_PK_FMA_F32:
    return AMDGPU::V_FMA_F32_e64;
  default:
    return std::numeric_limits<uint16_t>::max();
  }
}
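// Illustrative only (register numbers are made up): a packed op such as
//   v_pk_add_f32 v[0:1], v[2:3], v[4:5]
// is rewritten as two unpacked ops over the sub0/sub1 halves:
//   v_add_f32_e64 v0, v2, v4
//   v_add_f32_e64 v1, v3, v5
// with op_sel/op_sel_hi deciding which source half feeds each unpacked op.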
void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
                                          unsigned SrcMods, bool IsHiBits,
                                          const MachineOperand &SrcMO) {
  unsigned NewSrcMods = 0;
  unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
  unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
  // NEG_HI shares its bit position with ABS, but packed instructions do not
  // support ABS, so NEG_HI translates to the NEG modifier of the unpacked
  // (VOP3) instruction.
  if (SrcMods & NegModifier)
    NewSrcMods |= SISrcMods::NEG;
  NewMI.addImm(NewSrcMods);
  if (SrcMO.isImm()) {
    NewMI.addImm(SrcMO.getImm());
    return;
  }
  // Unpacked operations have no op_sel: select the subregister explicitly.
  Register UnpackedSrcReg = (SrcMods & OpSelModifier)
                                ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
                                : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);

  MachineOperand UnpackedSrcMO =
      MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
  // ... (undef/renamable flag propagation elided)
  // If both halves read the same subregister, only the second (hi) use may
  // carry the kill flag.
  bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
  bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
  bool KillState = true;
  if ((OpSel == OpSelHi) && !IsHiBits)
    KillState = false;
  UnpackedSrcMO.setIsKill(SrcMO.isKill() && KillState);
  NewMI.add(UnpackedSrcMO);
}
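// collectUnpackingCandidates walks forward from an MFMA, accumulating the
// latency of the instructions that follow it; any supported packed op found
// before the MFMA's result latency is covered becomes an unpacking candidate.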
void SIPreEmitPeephole::collectUnpackingCandidates(
    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
    uint16_t NumMFMACycles) {
  auto *BB = BeginMI.getParent();
  auto E = BB->end();
  int TotalCyclesBetweenCandidates = 0;
  auto SchedModel = TII->getSchedModel();
  Register MFMADef = BeginMI.getOperand(0).getReg();

  for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
    MachineInstr &Instr = *I;
    if (Instr.isMetaInstruction())
      continue;
    if ((Instr.isTerminator()) ||
        (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
        (SIInstrInfo::modifiesModeRegister(Instr) &&
         Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
      return;

    const MCSchedClassDesc *InstrSchedClassDesc =
        SchedModel.resolveSchedClass(&Instr);
    uint16_t Latency =
        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
    TotalCyclesBetweenCandidates += Latency;

    if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
      return;

    // Candidates must not touch the MFMA result while it is still in flight.
    for (const MachineOperand &InstrMO : Instr.operands()) {
      if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
        continue;
      if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
        return;
    }
    if (!isUnpackingSupportedInstr(Instr))
      continue;

    if (canUnpackingClobberRegister(Instr))
      return;
    // Unpacking trades one packed op (counted above) for two unpacked ones.
    TotalCyclesBetweenCandidates -= Latency;
    TotalCyclesBetweenCandidates += 2;
    if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
      InstrsToUnpack.insert(&Instr);
  }
}
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
  MachineOperand DstOp = I.getOperand(0);

  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
  assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
         "Unsupported Opcode");

  MachineInstrBuilder Op0LOp1L =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
  MachineOperand LoDstOp = Op0LOp1L->getOperand(0);
  LoDstOp.setIsRenamable(DstOp.isRenamable());

  MachineInstrBuilder Op0HOp1H =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
  MachineOperand HiDstOp = Op0HOp1H->getOperand(0);
  HiDstOp.setIsRenamable(DstOp.isRenamable());

  if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
    Op0LOp1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
    Op0HOp1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
  }
  if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
    Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract);
    Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract);
  }

  I.eraseFromParent();
}
MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
                                                        uint16_t UnpackedOpcode,
                                                        bool IsHiBits) {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
  const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
  Register DstReg = I.getOperand(0).getReg();
  unsigned OpCode = I.getOpcode();
  Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
                                     : TRI->getSubReg(DstReg, AMDGPU::sub0);

  int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
  unsigned Src0Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
  unsigned Src1Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();

  MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
  NewMI.addDef(UnpackedDstReg); // vdst
  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);

  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
    // FMA also takes src2.
    const MachineOperand *SrcMO3 =
        TII->getNamedOperand(I, AMDGPU::OpName::src2);
    unsigned Src2Mods =
        TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
  }
  NewMI.addImm(ClampVal); // clamp
  // Packed instructions do not support output modifiers.
  NewMI.addImm(0); // omod
  return NewMI;
}
PreservedAnalyses
SIPreEmitPeepholePass::run(MachineFunction &MF,
                           MachineFunctionAnalysisManager &MFAM) {
  if (!SIPreEmitPeephole().run(MF))
    return PreservedAnalyses::all();

  return getMachineFunctionPassPreservedAnalyses();
}
bool SIPreEmitPeephole::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    // Check the first terminator for branches to optimize.
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    // Scan the block for a pair of S_SET_GPR_IDX_ON instructions to see if
    // the second one is redundant. The expensive checks live in
    // optimizeSetGPR(); the scan distance is capped at 20 instructions for
    // compile-time reasons.
    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    for (MachineInstr &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  // Unpack packed F32 ops overlapped by MFMAs so they can co-issue.
  // ... (subtarget gating elided)
  for (MachineBasicBlock &MBB : MF) {
    auto SchedModel = TII->getSchedModel();
    SetVector<MachineInstr *> InstrsToUnpack;
    for (MachineInstr &MI : MBB.instrs()) {
      if (!SIInstrInfo::isMFMA(MI))
        continue;
      const MCSchedClassDesc *SchedClassDesc =
          SchedModel.resolveSchedClass(&MI);
      uint16_t NumMFMACycles =
          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
    }
    for (MachineInstr *MI : InstrsToUnpack)
      performF32Unpacking(*MI);
  }

  return Changed;
}