#define DEBUG_TYPE "si-wqm"

// ...
  StateStrict = StateStrictWWM | StateStrictWQM,
// ...

  explicit PrintState(int State) : State(State) {}

// ...
  static const std::pair<char, const char *> Mapping[] = {
      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
  char State = PS.State;
  for (auto M : Mapping) {
    if (State & M.first) {
// ...

  char MarkedStates = 0;
// ...
  char InitialState = 0;
  bool NeedsLowering = false;
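
// A standalone sketch (plain C++, not the LLVM types above) of the same
// bitmask pretty-printing idiom: walk a table of (bit, name) pairs and
// stream the names of whichever bits are set.
#include <iostream>
#include <utility>

namespace example {

enum : char {
  StateWQM = 1,
  StateStrictWWM = 2,
  StateStrictWQM = 4,
  StateExact = 8,
};

std::ostream &printState(std::ostream &OS, char State) {
  static const std::pair<char, const char *> Mapping[] = {
      {StateWQM, "WQM"},
      {StateStrictWWM, "StrictWWM"},
      {StateStrictWQM, "StrictWQM"},
      {StateExact, "Exact"}};
  bool First = true;
  for (auto M : Mapping) {
    if (State & M.first) {
      OS << (First ? "" : "|") << M.second;
      First = false;
    }
  }
  return OS;
}

} // namespace example

int main() {
  example::printState(std::cout, example::StateWQM | example::StateExact)
      << '\n'; // prints "WQM|Exact"
}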
class SIWholeQuadMode {
// ...
  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
// ...
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);
// ...
  bool lowerLiveMaskQueries();
  bool lowerCopyInstrs();
  bool lowerKillInstrs(bool IsWQM);
// ...
};

// ...
class SIWholeQuadModeLegacy : public MachineFunctionPass {
// ...
  StringRef getPassName() const override { return "SI Whole Quad Mode"; }
// ...
};
char SIWholeQuadModeLegacy::ID = 0;

// ...
  return new SIWholeQuadModeLegacy;
  for (const auto &BII : Blocks) {
    // ...
        << " InNeeds = " << PrintState(BII.second.InNeeds)
        << ", Needs = " << PrintState(BII.second.Needs)
        << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
    // ...
      auto III = Instructions.find(&MI);
      if (III != Instructions.end()) {
        dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
               << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];
// ...
  assert(!(Flag & StateExact) && Flag != 0);
// ...
  Flag &= ~II.Disabled;

  // If an instruction already needs everything in Flag, there is nothing to do.
  if ((II.Needs & Flag) == Flag)
    return;
// ...
  Worklist.emplace_back(&MI);
}
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
// ...
  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  const VNInfo *Value = UseLRQ.valueIn();
  if (!Value)
    return;

  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
                                : LaneBitmask::getNone());

  // Perform a depth-first iteration of the LiveRange and mark defs.
  // Stop processing of a given branch when all use lanes have been defined.
  struct PhiEntry {
    const VNInfo *Phi;
    unsigned PredIdx;
    LaneBitmask DefinedLanes;

    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
  };
  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0; // Only used for processing phi nodes
  do {
    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (Visited.insert(Key).second) {
      // On first visit to a phi, start processing at the first predecessor.
      NextPredIdx = 0;
    }

    if (Value->isPHIDef()) {
      // Each predecessor node in the phi must be processed as a subgraph.
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");

      // Find the next predecessor to process.
      unsigned Idx = NextPredIdx;
      const auto *PI = MBB->pred_begin() + Idx;
      const auto *PE = MBB->pred_end();
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
            NextValue = VN;
        }
      }

      // If there are more predecessors to process, push the phi on the stack.
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (Reg.isVirtual()) {
        // Iterate over all operands to find relevant definitions.
        bool HasDef = false;
        for (const MachineOperand &Op : MI->all_defs()) {
          if (Op.getReg() != Reg)
            continue;

          // Compute lanes defined and their overlap with the use.
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any lanes of the use.
          HasDef |= Overlap.any();

          // Mark any lanes defined.
          DefinedLanes |= OpLanes;
        }

        // Check whether all lanes of the use have been defined.
        if ((DefinedLanes & UseLanes) != UseLanes) {
          // Definition not complete; need to process the input value.
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
              NextValue = VN;
          }
        }

        // Only mark the instruction if it defines some part of the use.
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers, simply mark the defining instruction.
        markInstruction(*MI, Flag, Worklist);
      }
    }

    if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi.
      PhiEntry &Entry = PhiStack.back();
      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
      PhiStack.pop_back();
    }

    Value = NextValue;
  } while (Value);
}
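
// A standalone sketch (plain C++, simplified data model) of the traversal
// shape markDefs uses: an iterative walk over a value graph where phis fan
// out to predecessor values, with the visited set keyed on the pair
// (value, lanes-defined-so-far) so phi cycles terminate even though the same
// value may legitimately be revisited with a different lane set.
#include <cstdint>
#include <iostream>
#include <set>
#include <utility>
#include <vector>

struct Value {
  uint32_t DefLanes = 0;  // lanes written by this def (unused for phis)
  std::vector<int> Preds; // predecessor value ids (non-empty => phi)
};

std::vector<int> markDefsSketch(const std::vector<Value> &Vals, int Root,
                                uint32_t UseLanes) {
  std::vector<int> Marked;
  std::set<std::pair<int, uint32_t>> Visited;
  struct Frame {
    int Val;
    uint32_t Lanes;
  };
  std::vector<Frame> Stack{{Root, 0}};
  while (!Stack.empty()) {
    Frame F = Stack.back();
    Stack.pop_back();
    if (!Visited.insert({F.Val, F.Lanes}).second)
      continue; // already processed with this lane set
    const Value &V = Vals[F.Val];
    if (!V.Preds.empty()) {
      // Phi: each predecessor must be processed as its own subgraph.
      for (int P : V.Preds)
        Stack.push_back({P, F.Lanes});
      continue;
    }
    if (V.DefLanes & UseLanes)
      Marked.push_back(F.Val); // this def contributes lanes of the use
    // A fuller model would keep walking to the incoming value while
    // (Lanes | DefLanes) does not yet cover UseLanes, as the pass does.
  }
  return Marked;
}

int main() {
  // Value 0 is phi(1, 2); values 1 and 2 are defs of disjoint lane sets.
  std::vector<Value> Vals = {{0, {1, 2}}, {0x3, {}}, {0xC, {}}};
  for (int Id : markDefsSketch(Vals, 0, 0xF))
    std::cout << "marked def " << Id << '\n';
}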
void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                                  const MachineOperand &Op, char Flag,
                                  std::vector<WorkItem> &Worklist) {
// ...
  case AMDGPU::EXEC_LO:
    return;
// ...
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
// ...
      markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
// ...
}
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.all_uses())
    markOperand(MI, Use, Flag, Worklist);
}
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
// ...
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
  bool HasImplicitDerivatives =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing WQM before visiting it.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT) {
    BlockInfo &BBI = Blocks[MBB];

    for (MachineInstr &MI : *MBB) {
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // ...
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
      } else if (Opcode == AMDGPU::WQM) {
        // ...
        LowerToCopyInstrs.insert(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.insert(&MI);
        SoftWQMInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // ...
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WQM ||
                 TII->isDualSourceBlendEXP(MI)) {
        // ...
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;

        if (Opcode == AMDGPU::STRICT_WQM) {
          LowerToMovInstrs.push_back(&MI);
        } else {
          // Dual-source blend export: the sources are shuffled in strict WQM,
          // but the export itself must run in exact mode.
          BBI.Needs |= StateExact;
          if (!(BBI.InNeeds & StateExact)) {
            BBI.InNeeds |= StateExact;
            Worklist.emplace_back(MBB);
          }
          GlobalFlags |= StateExact;
          III.Disabled = StateWQM | StateStrict;
        }
      } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                 Opcode == AMDGPU::DS_PARAM_LOAD ||
                 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
                 Opcode == AMDGPU::DS_DIRECT_LOAD) {
        // Mark these for StrictWQM on the instruction only, not its operands.
        III.Needs |= StateStrictWQM;
        GlobalFlags |= StateStrictWQM;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
        // Disable strict states; StrictWWM will be added as required later.
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(4);
        if (Inactive.isReg()) {
          if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
            LowerToCopyInstrs.insert(&MI);
          else
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
        }
        SetInactiveInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.emplace_back(MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
      } else if (Opcode == AMDGPU::SI_PS_LIVE ||
                 Opcode == AMDGPU::SI_LIVE_MASK) {
        LiveMaskQueries.push_back(&MI);
      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                 Opcode == AMDGPU::SI_DEMOTE_I1) {
        KillInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
                 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
        InitExecInstrs.push_back(&MI);
      } else if (WQMOutputs) {
        // ...
        for (const MachineOperand &MO : MI.defs()) {
          Register Reg = MO.getReg();
          if (Reg.isPhysical() &&
              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
            Flags = StateWQM;
            break;
          }
        }
      }
// ...
      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

// ...
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}
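
// A standalone sketch of why scanInstructions iterates the CFG with
// ReversePostOrderTraversal: reverse post-order visits every block before
// its successors (ignoring back edges), so definitions are classified
// before the instructions that use them.
#include <algorithm>
#include <iostream>
#include <vector>

static void postOrder(int BB, const std::vector<std::vector<int>> &Succs,
                      std::vector<bool> &Seen, std::vector<int> &Out) {
  Seen[BB] = true;
  for (int S : Succs[BB])
    if (!Seen[S])
      postOrder(S, Succs, Seen, Out);
  Out.push_back(BB);
}

std::vector<int> reversePostOrder(const std::vector<std::vector<int>> &Succs) {
  std::vector<bool> Seen(Succs.size(), false);
  std::vector<int> Order;
  postOrder(0, Succs, Seen, Order); // block 0 is the entry
  std::reverse(Order.begin(), Order.end());
  return Order;
}

int main() {
  // Diamond CFG: 0 -> {1, 2}, 1 -> 3, 2 -> 3.
  std::vector<std::vector<int>> Succs = {{1, 2}, {3}, {3}, {}};
  for (int BB : reversePostOrder(Succs))
    std::cout << BB << ' '; // e.g. "0 2 1 3": the join block 3 comes last
  std::cout << '\n';
}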
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy; we modify the map below
  BlockInfo &BI = Blocks[MBB];

  // Control-flow instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level.
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.emplace_back(MBB);
    }
  }

  // Propagate backwards within the block.
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.emplace_back(PrevMI);
      }
    }
  }

  // Propagate the WQM flag to instruction inputs.
// ...
  markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // make a copy to prevent dangling references

  // Propagate through instructions.
// ...
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.emplace_back(LastMI);
    }
// ...
  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.emplace_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.emplace_back(Succ);
  }
}
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}
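
// A standalone sketch of the fixed-point loop above: state requirements flow
// backwards from blocks to their predecessors until nothing changes.
// Termination is guaranteed because the Needs bitmasks only ever gain bits.
#include <iostream>
#include <vector>

struct SketchBlock {
  char InNeeds = 0, OutNeeds = 0;
  std::vector<int> Preds;
};

void propagateSketch(std::vector<SketchBlock> &Blocks,
                     std::vector<int> Worklist) {
  while (!Worklist.empty()) {
    int B = Worklist.back();
    Worklist.pop_back();
    for (int P : Blocks[B].Preds) {
      char NewOut = Blocks[P].OutNeeds | Blocks[B].InNeeds;
      if (NewOut == Blocks[P].OutNeeds)
        continue; // predecessor already provides what we need
      Blocks[P].OutNeeds = NewOut;
      Blocks[P].InNeeds |= Blocks[B].InNeeds;
      Worklist.push_back(P); // re-examine the predecessor's own inputs
    }
  }
}

int main() {
  // Chain 0 -> 1 -> 2; block 2 needs WQM (bit 1) on entry.
  std::vector<SketchBlock> Blocks(3);
  Blocks[1].Preds = {0};
  Blocks[2].Preds = {1};
  Blocks[2].InNeeds = 1;
  propagateSketch(Blocks, {2});
  std::cout << int(Blocks[0].OutNeeds) << '\n'; // 1: requirement reached entry
}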
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
// ...
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);
// ...
}
void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
  MachineBasicBlock *BB = TermMI->getParent();
// ...
  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert the last instruction of the block to a terminator variant.
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  case AMDGPU::S_ANDN2_B32:
    NewOpcode = AMDGPU::S_ANDN2_B32_term;
    break;
  case AMDGPU::S_ANDN2_B64:
    NewOpcode = AMDGPU::S_ANDN2_B64_term;
    break;
  // ...
  }
// ...
  for (MachineBasicBlock *Succ : SplitBB->successors()) {
    DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
    DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
  }
  DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
// ...
}
MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
// ...
  unsigned Opcode = 0;

  // Each assignment selects the V_CMP opcode that computes the *killed*
  // lanes, i.e. the inverse of the kill condition; the corresponding
  // "case ISD::SET*:" labels are elided in this excerpt.
  switch (MI.getOperand(2).getImm()) {
  // ...
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  // ...
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  // ...
  }

  MachineBasicBlock &MBB = *MI.getParent();
// ...
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);
// ...
  MachineInstr *MaskUpdateMI =
      // ...
  MachineInstr *EarlyTermMI =
      // ...
  MachineInstr *ExecMaskMI =
      // ...
}
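
// A standalone demonstration of why inverted float comparisons use the
// "NLT"/"NLE"-style opcodes rather than ">="/">" variants: negating an IEEE
// comparison flips its ordered/unordered behaviour, and the two differ
// whenever an input is NaN.
#include <cmath>
#include <iostream>

int main() {
  float a = std::nanf(""), b = 1.0f;
  std::cout << std::boolalpha;
  std::cout << "a < b    = " << (a < b) << '\n';  // false
  std::cout << "!(a < b) = " << !(a < b) << '\n'; // true  ("not less than")
  std::cout << "a >= b   = " << (a >= b) << '\n'; // false (ordered >=)
}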
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
// ...
  MachineBasicBlock &MBB = *MI.getParent();
// ...
  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
// ...
    if (Op.getImm() == KillVal) {
// ...
  bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
  if (!IsLastTerminator) {
// ...
    TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
// ...
  MachineInstr *EarlyTermMI =
      // ...
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
// ...
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
// ...
  if (ComputeKilledMaskMI)
    // ...
}
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
  if (!BI.NeedsLowering)
    return;
// ...
  SmallVector<MachineInstr *, 4> SplitPoints;
// ...
  char State = BI.InitialState;
// ...
    auto MIState = StateTransition.find(&MI);
    if (MIState != StateTransition.end())
      State = MIState->second;

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MI);
      break;
    case AMDGPU::ENTER_STRICT_WWM:
      ActiveLanesReg = MI.getOperand(0).getReg();
      break;
    case AMDGPU::EXIT_STRICT_WWM:
      ActiveLanesReg = 0;
      break;
    case AMDGPU::V_SET_INACTIVE_B32:
      if (ActiveLanesReg) {
        LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
        MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
        MI.getOperand(5).setReg(ActiveLanesReg);
        // ...
      } else {
        assert(State == StateExact || State == StateWQM);
      }
      break;
// ...
    }
// ...
  for (MachineInstr *MI : SplitPoints)
    splitBlock(MI);
}
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
// ...
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;
// ...
    if (Next < FirstIdx)
      break;
// ...
  assert(EndMI && "Segment does not end on valid instruction");
// ...
  bool IsExecDef = false;
  for (const MachineOperand &MO : MBBI->all_defs()) {
    IsExecDef |=
        MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
  }
// ...
}
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
// ...
  bool IsTerminator = Before == MBB.end();
  if (!IsTerminator) {
    auto FirstTerm = MBB.getFirstTerminator();
    if (FirstTerm != MBB.end()) {
      SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
      SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
      IsTerminator = BeforeIdx > FirstTermIdx;
    }
  }
// ...
  StateTransition[MI] = StateExact;
}
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SavedWQM) {
// ...
  StateTransition[MI] = StateWQM;
}
void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator Before,
                                   Register SaveOrig, char StrictStateNeeded) {
// ...
  assert(StrictStateNeeded == StateStrictWWM ||
         StrictStateNeeded == StateStrictWQM);

  if (StrictStateNeeded == StateStrictWWM) {
// ...
  }
// ...
  StateTransition[MI] = StrictStateNeeded;
}
void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Before,
                                     Register SavedOrig, char NonStrictState,
                                     char CurrentStrictState) {
// ...
  assert(CurrentStrictState == StateStrictWWM ||
         CurrentStrictState == StateStrictWQM);

  if (CurrentStrictState == StateStrictWWM) {
// ...
  }
// ...
  StateTransition[MI] = NonStrictState;
}
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
                                   bool IsEntry) {
  // A non-entry block that is WQM throughout and doesn't have to switch to
  // Exact on exit needs no transitions at all.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }
// ...
  Register SavedWQMReg;
  Register SavedNonStrictReg;
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
// ...
  // Skip the instruction that saves LiveMask.
  if (II != IE && II->getOpcode() == AMDGPU::COPY &&
      II->getOperand(1).getReg() == LMC.ExecReg)
    ++II;
// ...
  BI.InitialState = State;

  for (unsigned Idx = 0;; ++Idx) {
    char Needs = StateExact | StateWQM; // Strict mode is disallowed by default.
    char OutNeeds = 0;
// ...
    if (FirstStrict == IE)
      FirstStrict = II;

    // ...
    if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
      // ...

    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we
        // can safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      // Exact mode exit can occur in terminators, but must be before branches.
      if (MI.isBranch() && OutNeeds == StateExact)
        Needs = StateExact;
// ...
    } else {
      // End of basic block.
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC.
        // Exact/Strict -> WQM: save SCC if WQM comes from exec.
        // Exact/Strict -> Exact: no save.
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC.
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
      }

      char StartState = State & StateStrict ? NonStrictState : State;
      bool WQMToExact =
          StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
      bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
                        !(Needs & StateExact);
      bool PreferLast = Needs == StateWQM;
      // Exact regions in divergent control flow may run with EXEC=0, so try
      // to exclude instructions with unwanted effects from them.
      if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
        // ...
        if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
          PreferLast = WQMToExact;
          // ...
        }
      }
// ...
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, PreferLast, SaveSCC);

      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
        // ...
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
        // ...
        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;
      } else {
        if (WQMToExact) {
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }
          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (ExactToWQM) {
          assert(WQMFromExec == (SavedWQMReg == 0));
          toWQM(MBB, Before, SavedWQMReg);
          // ...
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }
// ...
  }

  assert(!SavedNonStrictReg);
}
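
// A minimal standalone model (plain C++, not the LLVM API) of the
// save/restore discipline processBlock enforces through toExact/toWQM:
// switching from WQM to Exact saves the current exec mask and narrows EXEC
// to the live lanes; switching back restores the saved mask.
#include <cassert>
#include <cstdint>
#include <iostream>

struct WaveState {
  uint64_t Exec;          // current EXEC mask
  uint64_t SavedExec = 0; // copy taken before narrowing
};

void sketchToExact(WaveState &W, uint64_t LiveMask) {
  W.SavedExec = W.Exec; // plays the role of SavedWQMReg above
  W.Exec &= LiveMask;   // only truly live lanes keep running
}

void sketchToWQM(WaveState &W) {
  assert(W.SavedExec && "no saved exec mask to restore");
  W.Exec = W.SavedExec;
  W.SavedExec = 0;
}

int main() {
  WaveState W{0xFF};        // whole quads enabled (WQM)
  uint64_t LiveMask = 0x0F; // lanes that are actually live
  sketchToExact(W, LiveMask);
  std::cout << std::hex << W.Exec << '\n'; // f
  sketchToWQM(W);
  std::cout << std::hex << W.Exec << '\n'; // ff
}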
bool SIWholeQuadMode::lowerLiveMaskQueries() {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();

    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
  return !LiveMaskQueries.empty();
}
bool SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);
// ...
    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec, like all VALU movs
      // should do.
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
// ...
      if (MI->getOperand(0).isEarlyClobber()) {
// ...
        MI->getOperand(0).setIsEarlyClobber(false);
// ...
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, nullptr);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
    }
  }

  for (MachineInstr *MI : LowerToCopyInstrs) {
// ...
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
      assert(MI->getNumExplicitOperands() == 6);

      LiveInterval *RecomputeLI = nullptr;
      if (MI->getOperand(4).isReg())
        RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());

      MI->removeOperand(5);
      MI->removeOperand(4);
      MI->removeOperand(3);
      MI->removeOperand(1);
// ...
    }

    assert(MI->getNumExplicitOperands() == 2);
// ...
    unsigned CopyOp = MI->getOperand(1).isReg()
                          ? (unsigned)AMDGPU::COPY
                          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                *MRI, MI->getOperand(0)));
    MI->setDesc(TII->get(CopyOp));
// ...
  }
  return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}
bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MI);
      break;
    }
    if (SplitPoint)
      splitBlock(SplitPoint);
  }
  return !KillInstrs.empty();
}
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();
// ...
  if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
    assert(MBB == &MBB->getParent()->front() &&
           "init whole wave not in entry block");
    Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
// ...
    MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
// ...
    MI.eraseFromParent();
// ...
  }

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // ...
        .addImm(MI.getOperand(0).getImm());
// ...
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
// ...
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
// ...
  MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
// ...
  if (DefInstr != FirstMI) {
// ...
  }
// ...
  const DebugLoc &DL = MI.getDebugLoc();
// ...
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   // ...
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
// ...
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   // ...
// ...
    MI.eraseFromParent();
// ...
  MI.eraseFromParent();
}
MachineBasicBlock::iterator
SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
  MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();

  for (MachineInstr *MI : InitExecInstrs) {
    // ...
    if (MI->getParent() == &Entry)
      InsertPt = std::next(MI->getIterator());

    lowerInitExec(*MI);
    Changed = true;
  }
// ...
  return InsertPt;
}
bool SIWholeQuadMode::run(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
// ...
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  InitExecInstrs.clear();
  SetInactiveInstrs.clear();
  StateTransition.clear();
// ...
  char GlobalFlags = analyzeFunction(MF);
  bool Changed = false;

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);

  const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
  const bool HasWaveModes = GlobalFlags & ~StateExact;
  const bool HasKills = !KillInstrs.empty();
  const bool UsesWQM = GlobalFlags & StateWQM;
  if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
// ...
  }
// ...
  for (MachineInstr *MI : SetInactiveInstrs) {
    if (LowerToCopyInstrs.contains(MI))
      continue;
    auto &Info = Instructions[MI];
    if (Info.MarkedStates & StateStrict) {
      Info.Needs |= StateStrictWWM;
      Info.Disabled &= ~StateStrictWWM;
      Blocks[MI->getParent()].Needs |= StateStrictWWM;
    } else {
      // ...
      LowerToCopyInstrs.insert(MI);
    }
  }
// ...
  Changed |= lowerLiveMaskQueries();
// ...
  if (!HasWaveModes) {
    // No wave-mode execution.
    Changed |= lowerKillInstrs(false);
  } else if (GlobalFlags == StateWQM) {
    // The shader only needs WQM.
    // ...
    lowerKillInstrs(true);
// ...
  } else {
    // Mark the entry block for WQM if required.
    if (GlobalFlags & StateWQM)
      Blocks[&Entry].InNeeds |= StateWQM;
    // Wave-mode switching requires a full lowering pass.
    for (auto &BII : Blocks)
      processBlock(*BII.first, BII.second, BII.first == &Entry);
    // Lowering blocks causes block splitting, so perform it as a second pass.
    for (auto &BII : Blocks)
      lowerBlock(*BII.first, BII.second);
// ...
  }

  // Compute the live range for the live mask.
  if (LiveMaskReg != LMC.ExecReg)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);
// ...
  if (!KillInstrs.empty() || !InitExecInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(LMC.ExecReg);

  return Changed;
}
bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) {
  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
  MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
  auto *PDTWrapper =
      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
  MachinePostDominatorTree *PDT =
      PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
  return Impl.run(MF);
}

// ...
  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
// ...