#define DEBUG_TYPE "si-optimize-exec-masking"
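// Summary of what this excerpt implements: running after register
// allocation, the pass folds sequences that copy EXEC into an SGPR, apply a
// logical operation, and copy the result back into EXEC into the dedicated
// S_*_SAVEEXEC instructions. On GFX10.3+ it additionally rewrites
// v_cmp + s_and_saveexec pairs into v_cmpx, and s_or_saveexec + s_xor pairs
// into s_andn2_saveexec.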
class SIOptimizeExecMasking {
  MachineFunction *MF = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const AMDGPU::LaneMaskConstants &LMC;

  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
  SmallVector<MachineOperand *, 1> KillFlagCandidates;

  Register isCopyFromExec(const MachineInstr &MI) const;
  Register isCopyToExec(const MachineInstr &MI) const;
  bool removeTerminatorBit(MachineInstr &MI) const;
  MachineBasicBlock::reverse_iterator
  fixTerminators(MachineBasicBlock &MBB) const;
  MachineBasicBlock::reverse_iterator
  findExecCopy(MachineBasicBlock &MBB,
               MachineBasicBlock::reverse_iterator I) const;
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg);
  MachineInstr *findInstrBackwards(
      MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
      ArrayRef<MCRegister> NonModifiableRegs,
      MachineInstr *Terminator = nullptr,
      SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
      unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp);
  void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
  bool optimizeOrSaveexecXorSequences();

public:
  SIOptimizeExecMasking(MachineFunction *MF)
      : MF(MF), ST(&MF->getSubtarget<GCNSubtarget>()),
        TRI(ST->getRegisterInfo()), TII(ST->getInstrInfo()),
        MRI(&MF->getRegInfo()), LMC(AMDGPU::LaneMaskConstants::get(*ST)) {}

  bool run();
};

class SIOptimizeExecMaskingLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIOptimizeExecMaskingLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
PreservedAnalyses
SIOptimizeExecMaskingPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &MFAM) {
  SIOptimizeExecMasking Impl(&MF);

  if (!Impl.run())
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}
111 "SI optimize exec mask operations",
false,
false)
char SIOptimizeExecMaskingLegacy::ID = 0;
/// If \p MI is a copy from exec, return the register copied to.
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == LMC.ExecReg)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}
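// Illustrative only (register names are hypothetical): for MIR such as
//   $sgpr0_sgpr1 = S_MOV_B64 $exec
// isCopyFromExec returns the destination ($sgpr0_sgpr1); for anything else
// it returns NoRegister.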
/// If \p MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == LMC.ExecReg &&
        MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}
/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}
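// Illustrative only (hypothetical registers): for
//   $sgpr0_sgpr1 = S_AND_B64 $exec, $sgpr2_sgpr3
// isLogicalOpOnExec returns $sgpr0_sgpr1. Only EXEC appearing directly as
// one of the two source operands is recognized; copies of EXEC are not
// traced through.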
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}
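// A save-exec opcode performs the same logical operation but additionally
// writes the old EXEC value to its destination, e.g.
//   S_AND_SAVEEXEC_B64 %dst, %src  ==>  %dst = EXEC; EXEC = %src & EXEC
// INSTRUCTION_LIST_END serves as the "no save-exec equivalent" sentinel
// checked by the callers below.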
// These are only used as terminators to get correct spill code placement
// during register allocation; turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term:
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  case AMDGPU::S_XOR_B32_term:
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  case AMDGPU::S_OR_B64_term:
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  case AMDGPU::S_OR_B32_term:
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  case AMDGPU::S_ANDN2_B64_term:
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  case AMDGPU::S_ANDN2_B32_term:
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  case AMDGPU::S_AND_B64_term:
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  case AMDGPU::S_AND_B32_term:
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  default:
    return false;
  }
}
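// The *_term pseudo variants exist so that these EXEC updates count as block
// terminators while live ranges are split and spills are placed during
// register allocation; by this point that constraint is no longer needed, so
// removeTerminatorBit lowers them back to the ordinary opcodes (or a COPY,
// for the register-source moves).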
// Turn all pseudoterminators in the block into their equivalent
// non-terminator instructions; return an iterator to the first
// non-terminator.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}
MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (const MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}
// Backwards-iterate from Origin (for at most MaxInstructions iterations)
// until either the beginning of the BB is reached or Pred evaluates to true,
// and return the matching instruction. Return nullptr if one of the
// registers given in NonModifiableRegs is modified first.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
    SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
    unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;

      // Collect kill flags seen between the terminator and the match; they
      // may become invalid after the rewrite and are cleared later.
      if (Terminator && KillFlagCandidates && A != Terminator &&
          A->killsRegister(Reg, TRI)) {
        for (MachineOperand &MO : A->operands()) {
          if (MO.isReg() && MO.isKill()) {
            Register Candidate = MO.getReg();
            if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
              KillFlagCandidates->push_back(&MO);
          }
        }
      }
    }

    ++CurrentIteration;
  }

  return nullptr;
}
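// Hypothetical usage sketch (names here are illustrative, not from the
// original): find a preceding instruction that defines Src while neither
// EXEC nor Src is modified in between, mirroring the v_cmpx recording code
// further below:
//   MachineInstr *Def = findInstrBackwards(
//       AndSaveExec,
//       [&](MachineInstr *I) { return I->modifiesRegister(Src, TRI); },
//       {LMC.ExecReg});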
// Determine if Reg is not re-defined and still in use in the range
// (Stop..Start], calculating liveness backwards from the end of the block.
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LiveRegUnits LR(*TRI);
  if (UseLiveOuts)
    LR.addLiveOuts(*Stop.getParent());

  MachineBasicBlock::reverse_iterator A(Start);
  if (IgnoreStart)
    ++A;

  for (MachineBasicBlock::reverse_iterator E(Stop); A != E; ++A)
    LR.stepBackward(*A);

  return !LR.available(Reg) || MRI->isReserved(Reg);
}
// Determine if Reg is not re-defined and still in use in the range
// (Stop..BB.end].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}
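// The transformation performed below, roughly (register names illustrative):
//    %sel = S_CSELECT_B64 -1, 0
//    %cmp = S_AND_B64 exec, %sel
//    exec = COPY %cmp              ; a COPY/S_MOV_*_term before fixTerminators
// =>
//    exec = S_AND_SAVEEXEC_B64 %sel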
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        PrepareExecInst->getOperand(0).setReg(LMC.ExecReg);
        CopyToExecInst->eraseFromParent();
        Changed = true;
      }
      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another
      // block.
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(LMC.ExecReg, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst
                            << '\n');
          continue;
        }
        LLVM_DEBUG(dbgs() << "Instruction does not read exec copy: " << *J
                          << '\n');
        break;
      }
      if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy before
        // it is rewritten by the saveexec, e.g. an inserted spill.
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);
    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;
      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();
    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, LMC.ExecReg,
                                    AMDGPU::NoSubRegister, *TRI);
    }

    Changed = true;
  }

  return Changed;
}
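// On GFX10.3+ the sequence
//   v_cmp_*             sdst, src0, src1
//   s_and_saveexec_b32  sexec, sdst
// can be rewritten, when sdst has no other uses, as
//   s_mov_b32  sexec, exec_lo
//   v_cmpx_*   src0, src1       ; writes EXEC implicitly
// avoiding the round trip through the SGPR and the associated pipeline
// stalls.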
// Inserts the optimized s_mov / v_cmpx sequence based on the operands
// extracted from a v_cmp ... s_and_saveexec pattern.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp) {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(LMC.ExecReg);
  }

  // Omit dst as V_CMPX is implicitly writing to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](AMDGPU::OpName OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);
  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);
  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::op_sel);

  // The kill flags may no longer be correct.
  if (Src0->isReg())
    MRI->clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI->clearKillFlags(Src1->getReg());
  for (MachineOperand *MO : KillFlagCandidates)
    MO->setIsKill(false);

  SaveExecInstr.eraseFromParent();
  VCmp.eraseFromParent();

  return true;
}
// Record (on GFX10.3 and later) occurrences of
//   v_cmp_*             SGPR, IMM, VGPR
//   s_and_saveexec_b32  EXEC_SGPR_DEST, SGPR
// so they can later be replaced with
//   s_mov_b32  EXEC_SGPR_DEST, exec_lo
//   v_cmpx_*   IMM, VGPR
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  if (MI.getOpcode() != LMC.AndSaveExecOpc)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Try to find the last v_cmp instruction that defines the saveexec input
  // operand without any write to EXEC or the operand itself in between.
  MachineInstr *VCmp = nullptr;
  VCmp = findInstrBackwards(
      MI,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {LMC.ExecReg, SaveExecSrc0->getReg()});
  if (!VCmp)
    return;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Skip if the v_cmp result is used in a successor block, or is still in
  // use between the v_cmp and the s_and_saveexec or after it.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return;
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // Walk the range again to make sure the v_cmp source operands are not
  // redefined, collecting kill flags that must be cleared after the rewrite.
  SmallVector<MCRegister, 2> NonDefRegs;
  if (MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
      Src0 && Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());
  if (MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
      Src1 && Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
          VCmp, &KillFlagCandidates))
    return;

  SaveExecVCmpMapping[&MI] = VCmp;
}
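// Record occurrences of
//   s_or_saveexec s_o, s_i
//   s_xor         exec, exec, s_o
// which optimizeOrSaveexecXorSequences below rewrites as a single
//   s_andn2_saveexec s_o, s_i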
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  if (MI.getOpcode() == LMC.XorOpc && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);
    if (XorDst.isReg() && XorDst.getReg() == LMC.ExecReg && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == LMC.ExecReg || XorSrc1.getReg() == LMC.ExecReg)) {
      // Peek at the previous instruction and check if this is a relevant
      // s_or_saveexec instruction.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != LMC.OrSaveExecOpc)
        return;

      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
      if (OrDst.isReg() && OrSrc0.isReg()) {
        if ((XorSrc0.getReg() == LMC.ExecReg &&
             XorSrc1.getReg() == OrDst.getReg()) ||
            (XorSrc0.getReg() == OrDst.getReg() &&
             XorSrc1.getReg() == LMC.ExecReg)) {
          OrXors.emplace_back(&PossibleOrSaveexec, &MI);
        }
      }
    }
  }
}
bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty()) {
    return false;
  }

  bool Changed = false;
  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(LMC.AndN2SaveExecOpc), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}
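// Why the rewrite is sound: with e = EXEC on entry,
//   s_or_saveexec s_o, s_i    ; s_o = e, EXEC = s_i | e
//   s_xor exec, exec, s_o     ; EXEC = (s_i | e) ^ e = s_i & ~e
// while
//   s_andn2_saveexec s_o, s_i ; s_o = e, EXEC = s_i & ~e
// so both forms leave s_o = e and EXEC = s_i & ~e.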
bool SIOptimizeExecMaskingLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIOptimizeExecMasking(&MF).run();
}

bool SIOptimizeExecMasking::run() {
  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  KillFlagCandidates.clear();
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : *MF) {
    unsigned SearchCount = 0;

    for (auto &MI : llvm::reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;

      if (SearchCount >= SearchWindow) {
        break;
      }

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(LMC.ExecReg, TRI)) {
        break;
      }

      ++SearchCount;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();

    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr);
  }

  return Changed;
}
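// Design note: the recording loop scans only the last SearchWindow (10)
// non-debug instructions of each block, iterating backwards and stopping at
// the first EXEC write, since the targeted saveexec patterns are emitted by
// control-flow lowering at block boundaries; scanning further would rarely
// find more matches (an assumption of this sketch) while costing compile
// time.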