88#define DEBUG_TYPE "si-wqm"
97 StateStrict = StateStrictWWM | StateStrictWQM,
104 explicit PrintState(
int State) : State(State) {}
110 static const std::pair<char, const char *> Mapping[] = {
111 std::pair(StateWQM,
"WQM"), std::pair(StateStrictWWM,
"StrictWWM"),
112 std::pair(StateStrictWQM,
"StrictWQM"), std::pair(StateExact,
"Exact")};
113 char State = PS.State;
114 for (
auto M : Mapping) {
115 if (State & M.first) {
132 char MarkedStates = 0;
139 char InitialState = 0;
140 bool NeedsLowering =
false;
152class SIWholeQuadMode {
174 unsigned AndSaveExecOpc;
175 unsigned AndSaveExecTermOpc;
196 std::vector<WorkItem> &Worklist);
198 unsigned SubReg,
char Flag, std::vector<WorkItem> &Worklist);
200 std::vector<WorkItem> &Worklist);
202 std::vector<WorkItem> &Worklist);
203 char scanInstructions(
MachineFunction &MF, std::vector<WorkItem> &Worklist);
204 void propagateInstruction(
MachineInstr &
MI, std::vector<WorkItem> &Worklist);
219 Register SaveOrig,
char StrictStateNeeded);
222 char NonStrictState,
char CurrentStrictState);
231 bool lowerLiveMaskQueries();
232 bool lowerCopyInstrs();
233 bool lowerKillInstrs(
bool IsWQM);
264char SIWholeQuadModeLegacy::ID = 0;
277 return new SIWholeQuadModeLegacy;
282 for (
const auto &BII :
Blocks) {
285 <<
" InNeeds = " << PrintState(BII.second.InNeeds)
286 <<
", Needs = " << PrintState(BII.second.Needs)
287 <<
", OutNeeds = " << PrintState(BII.second.OutNeeds) <<
"\n\n";
290 auto III = Instructions.find(&
MI);
291 if (III != Instructions.end()) {
292 dbgs() <<
" " <<
MI <<
" Needs = " << PrintState(III->second.Needs)
293 <<
", OutNeeds = " << PrintState(III->second.OutNeeds) <<
'\n';
301 std::vector<WorkItem> &Worklist) {
304 assert(!(Flag & StateExact) && Flag != 0);
313 Flag &= ~II.Disabled;
317 if ((
II.Needs & Flag) == Flag)
322 Worklist.emplace_back(&
MI);
328 std::vector<WorkItem> &Worklist) {
340 : (
Reg.isVirtual() ?
MRI->getMaxLaneMaskForVReg(Reg)
352 :
Phi(
Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
354 using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
358 unsigned NextPredIdx = 0;
360 const VNInfo *NextValue =
nullptr;
361 const VisitKey
Key(
Value, DefinedLanes);
363 if (Visited.
insert(Key).second) {
368 if (
Value->isPHIDef()) {
371 assert(
MBB &&
"Phi-def has no defining MBB");
374 unsigned Idx = NextPredIdx;
377 for (; PI != PE && !NextValue; ++PI, ++
Idx) {
379 if (!Visited.
count(VisitKey(VN, DefinedLanes)))
389 assert(
MI &&
"Def has no defining instruction");
391 if (
Reg.isVirtual()) {
395 if (
Op.getReg() != Reg)
401 :
TRI->getSubRegIndexLaneMask(
Op.getSubReg());
405 HasDef |= Overlap.
any();
408 DefinedLanes |= OpLanes;
412 if ((DefinedLanes & UseLanes) != UseLanes) {
416 if (!Visited.
count(VisitKey(VN, DefinedLanes)))
423 markInstruction(*
MI, Flag, Worklist);
426 markInstruction(*
MI, Flag, Worklist);
430 if (!NextValue && !PhiStack.
empty()) {
433 NextValue =
Entry.Phi;
434 NextPredIdx =
Entry.PredIdx;
435 DefinedLanes =
Entry.DefinedLanes;
445 std::vector<WorkItem> &Worklist) {
452 case AMDGPU::EXEC_LO:
460 if (
Reg.isVirtual()) {
462 markDefs(
MI, LR, Reg,
Op.getSubReg(), Flag, Worklist);
471 markDefs(
MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
477void SIWholeQuadMode::markInstructionUses(
const MachineInstr &
MI,
char Flag,
478 std::vector<WorkItem> &Worklist) {
479 LLVM_DEBUG(
dbgs() <<
"markInstructionUses " << PrintState(Flag) <<
": "
483 markOperand(
MI,
Use, Flag, Worklist);
489 std::vector<WorkItem> &Worklist) {
490 char GlobalFlags = 0;
493 bool HasImplicitDerivatives =
506 unsigned Opcode =
MI.getOpcode();
509 if (
TII->isWQM(Opcode)) {
514 if (
ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
518 markInstructionUses(
MI, StateWQM, Worklist);
519 GlobalFlags |= StateWQM;
521 }
else if (Opcode == AMDGPU::WQM) {
525 LowerToCopyInstrs.insert(&
MI);
526 }
else if (Opcode == AMDGPU::SOFT_WQM) {
527 LowerToCopyInstrs.insert(&
MI);
529 }
else if (Opcode == AMDGPU::STRICT_WWM) {
533 markInstructionUses(
MI, StateStrictWWM, Worklist);
534 GlobalFlags |= StateStrictWWM;
535 LowerToMovInstrs.push_back(&
MI);
536 }
else if (Opcode == AMDGPU::STRICT_WQM ||
537 TII->isDualSourceBlendEXP(
MI)) {
541 markInstructionUses(
MI, StateStrictWQM, Worklist);
542 GlobalFlags |= StateStrictWQM;
544 if (Opcode == AMDGPU::STRICT_WQM) {
545 LowerToMovInstrs.push_back(&
MI);
550 BBI.Needs |= StateExact;
551 if (!(BBI.InNeeds & StateExact)) {
552 BBI.InNeeds |= StateExact;
553 Worklist.emplace_back(
MBB);
555 GlobalFlags |= StateExact;
556 III.Disabled = StateWQM | StateStrict;
558 }
else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
559 Opcode == AMDGPU::DS_PARAM_LOAD ||
560 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
561 Opcode == AMDGPU::DS_DIRECT_LOAD) {
564 III.Needs |= StateStrictWQM;
565 GlobalFlags |= StateStrictWQM;
566 }
else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
568 III.Disabled = StateStrict;
570 if (Inactive.
isReg()) {
571 if (Inactive.
isUndef() &&
MI.getOperand(3).getImm() == 0)
572 LowerToCopyInstrs.insert(&
MI);
574 markOperand(
MI, Inactive, StateStrictWWM, Worklist);
576 SetInactiveInstrs.push_back(&
MI);
577 BBI.NeedsLowering =
true;
578 }
else if (
TII->isDisableWQM(
MI)) {
579 BBI.Needs |= StateExact;
580 if (!(BBI.InNeeds & StateExact)) {
581 BBI.InNeeds |= StateExact;
582 Worklist.emplace_back(
MBB);
584 GlobalFlags |= StateExact;
585 III.Disabled = StateWQM | StateStrict;
586 }
else if (Opcode == AMDGPU::SI_PS_LIVE ||
587 Opcode == AMDGPU::SI_LIVE_MASK) {
588 LiveMaskQueries.push_back(&
MI);
589 }
else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
590 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
591 Opcode == AMDGPU::SI_DEMOTE_I1) {
592 KillInstrs.push_back(&
MI);
593 BBI.NeedsLowering =
true;
594 }
else if (Opcode == AMDGPU::SI_INIT_EXEC ||
595 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
596 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
597 InitExecInstrs.push_back(&
MI);
598 }
else if (WQMOutputs) {
605 if (
Reg.isPhysical() &&
606 TRI->hasVectorRegisters(
TRI->getPhysRegBaseClass(Reg))) {
614 markInstruction(
MI, Flags, Worklist);
615 GlobalFlags |=
Flags;
624 if (GlobalFlags & StateWQM) {
626 markInstruction(*
MI, StateWQM, Worklist);
628 markInstruction(*
MI, StateWQM, Worklist);
635 std::vector<WorkItem>& Worklist) {
642 if ((
II.OutNeeds & StateWQM) && !(
II.Disabled & StateWQM) &&
643 (
MI.isTerminator() || (
TII->usesVM_CNT(
MI) &&
MI.mayStore()))) {
649 if (
II.Needs & StateWQM) {
650 BI.Needs |= StateWQM;
651 if (!(BI.InNeeds & StateWQM)) {
652 BI.InNeeds |= StateWQM;
653 Worklist.emplace_back(
MBB);
659 char InNeeds = (
II.Needs & ~StateStrict) |
II.OutNeeds;
660 if (!PrevMI->isPHI()) {
662 if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
663 PrevII.OutNeeds |= InNeeds;
664 Worklist.emplace_back(PrevMI);
673 markInstructionUses(
MI,
II.Needs, Worklist);
677 if (
II.Needs & StateStrictWWM)
678 BI.Needs |= StateStrictWWM;
679 if (
II.Needs & StateStrictWQM)
680 BI.Needs |= StateStrictWQM;
684 std::vector<WorkItem>& Worklist) {
691 if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
692 LastII.OutNeeds |= BI.OutNeeds;
693 Worklist.emplace_back(LastMI);
699 BlockInfo &PredBI =
Blocks[Pred];
700 if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
703 PredBI.OutNeeds |= BI.InNeeds;
704 PredBI.InNeeds |= BI.InNeeds;
705 Worklist.emplace_back(Pred);
710 BlockInfo &SuccBI =
Blocks[Succ];
711 if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
714 SuccBI.InNeeds |= BI.OutNeeds;
715 Worklist.emplace_back(Succ);
720 std::vector<WorkItem> Worklist;
721 char GlobalFlags = scanInstructions(MF, Worklist);
723 while (!Worklist.empty()) {
728 propagateInstruction(*WI.MI, Worklist);
730 propagateBlock(*WI.MBB, Worklist);
739 Register SaveReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
748 LIS->InsertMachineInstrInMaps(*Save);
749 LIS->InsertMachineInstrInMaps(*Restore);
750 LIS->createAndComputeVirtRegInterval(SaveReg);
761 BB->
splitAt(*TermMI,
true, LIS);
765 unsigned NewOpcode = 0;
767 case AMDGPU::S_AND_B32:
768 NewOpcode = AMDGPU::S_AND_B32_term;
770 case AMDGPU::S_AND_B64:
771 NewOpcode = AMDGPU::S_AND_B64_term;
773 case AMDGPU::S_MOV_B32:
774 NewOpcode = AMDGPU::S_MOV_B32_term;
776 case AMDGPU::S_MOV_B64:
777 NewOpcode = AMDGPU::S_MOV_B64_term;
779 case AMDGPU::S_ANDN2_B32:
780 NewOpcode = AMDGPU::S_ANDN2_B32_term;
782 case AMDGPU::S_ANDN2_B64:
783 NewOpcode = AMDGPU::S_ANDN2_B64_term;
798 DTUpdates.
push_back({DomTreeT::Insert, SplitBB, Succ});
799 DTUpdates.
push_back({DomTreeT::Delete, BB, Succ});
801 DTUpdates.
push_back({DomTreeT::Insert, BB, SplitBB});
803 MDT->applyUpdates(DTUpdates);
805 PDT->applyUpdates(DTUpdates);
810 assert(LiveMaskReg.isVirtual());
824 switch (
MI.getOperand(2).getImm()) {
826 Opcode = AMDGPU::V_CMP_LG_F32_e64;
829 Opcode = AMDGPU::V_CMP_GE_F32_e64;
832 Opcode = AMDGPU::V_CMP_GT_F32_e64;
835 Opcode = AMDGPU::V_CMP_LE_F32_e64;
838 Opcode = AMDGPU::V_CMP_LT_F32_e64;
841 Opcode = AMDGPU::V_CMP_EQ_F32_e64;
844 Opcode = AMDGPU::V_CMP_O_F32_e64;
847 Opcode = AMDGPU::V_CMP_U_F32_e64;
851 Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
855 Opcode = AMDGPU::V_CMP_NLT_F32_e64;
859 Opcode = AMDGPU::V_CMP_NLE_F32_e64;
863 Opcode = AMDGPU::V_CMP_NGT_F32_e64;
867 Opcode = AMDGPU::V_CMP_NGE_F32_e64;
871 Opcode = AMDGPU::V_CMP_NLG_F32_e64;
885 Register VCC =
ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
916 LIS->ReplaceMachineInstrInMaps(
MI, *VcmpMI);
919 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
920 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
921 LIS->InsertMachineInstrInMaps(*ExecMaskMI);
927 assert(LiveMaskReg.isVirtual());
934 const bool IsDemote = IsWQM && (
MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
936 int64_t KillVal =
MI.getOperand(1).getImm();
943 if (
Op.getImm() == KillVal) {
950 bool IsLastTerminator = std::next(
MI.getIterator()) ==
MBB.
end();
951 if (!IsLastTerminator) {
952 LIS->RemoveMachineInstrFromMaps(
MI);
957 LIS->ReplaceMachineInstrInMaps(
MI, *NewTerm);
966 TmpReg =
MRI->createVirtualRegister(
TRI->getBoolRC());
967 ComputeKilledMaskMI =
992 LiveMaskWQM =
MRI->createVirtualRegister(
TRI->getBoolRC());
1001 unsigned MovOpc =
ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1003 }
else if (!IsWQM) {
1008 unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1015 LIS->RemoveMachineInstrFromMaps(
MI);
1020 if (ComputeKilledMaskMI)
1021 LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1022 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1023 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1025 LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1026 LIS->InsertMachineInstrInMaps(*NewTerm);
1029 LIS->removeInterval(CndReg);
1030 LIS->createAndComputeVirtRegInterval(CndReg);
1033 LIS->createAndComputeVirtRegInterval(TmpReg);
1035 LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1044 if (!BI.NeedsLowering)
1051 char State = BI.InitialState;
1055 auto MIState = StateTransition.find(&
MI);
1056 if (MIState != StateTransition.end())
1057 State = MIState->second;
1060 switch (
MI.getOpcode()) {
1061 case AMDGPU::SI_DEMOTE_I1:
1062 case AMDGPU::SI_KILL_I1_TERMINATOR:
1063 SplitPoint = lowerKillI1(
MI, State == StateWQM);
1065 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1066 SplitPoint = lowerKillF32(
MI);
1068 case AMDGPU::ENTER_STRICT_WWM:
1069 ActiveLanesReg =
MI.getOperand(0).getReg();
1071 case AMDGPU::EXIT_STRICT_WWM:
1074 case AMDGPU::V_SET_INACTIVE_B32:
1075 if (ActiveLanesReg) {
1077 MRI->constrainRegClass(ActiveLanesReg,
TRI->getWaveMaskRegClass());
1078 MI.getOperand(5).setReg(ActiveLanesReg);
1079 LIS->shrinkToUses(&LI);
1081 assert(State == StateExact || State == StateWQM);
1109 : LIS->getMBBEndIdx(&
MBB);
1111 Last != MBBE ? LIS->getInstructionIndex(*
Last) : LIS->getMBBEndIdx(&
MBB);
1122 if (Next < FirstIdx)
1127 assert(EndMI &&
"Segment does not end on valid instruction");
1131 SlotIndex Next = LIS->getInstructionIndex(*NextI);
1151 bool IsExecDef =
false;
1154 MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1171 assert(LiveMaskReg.isVirtual());
1173 bool IsTerminator = Before ==
MBB.
end();
1174 if (!IsTerminator) {
1176 if (FirstTerm !=
MBB.
end()) {
1177 SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1178 SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1179 IsTerminator = BeforeIdx > FirstTermIdx;
1186 unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1190 unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1196 LIS->InsertMachineInstrInMaps(*
MI);
1197 StateTransition[
MI] = StateExact;
1212 LIS->InsertMachineInstrInMaps(*
MI);
1213 StateTransition[
MI] = StateWQM;
1218 Register SaveOrig,
char StrictStateNeeded) {
1221 assert(StrictStateNeeded == StateStrictWWM ||
1222 StrictStateNeeded == StateStrictWQM);
1224 if (StrictStateNeeded == StateStrictWWM) {
1233 LIS->InsertMachineInstrInMaps(*
MI);
1234 StateTransition[
MI] = StrictStateNeeded;
1239 Register SavedOrig,
char NonStrictState,
1240 char CurrentStrictState) {
1244 assert(CurrentStrictState == StateStrictWWM ||
1245 CurrentStrictState == StateStrictWQM);
1247 if (CurrentStrictState == StateStrictWWM) {
1256 LIS->InsertMachineInstrInMaps(*
MI);
1257 StateTransition[
MI] = NonStrictState;
1264 if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1265 BI.InitialState = StateWQM;
1274 bool WQMFromExec = IsEntry;
1275 char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1276 char NonStrictState = 0;
1282 if (
II != IE &&
II->getOpcode() == AMDGPU::COPY &&
1283 II->getOperand(1).getReg() ==
TRI->getExec())
1298 BI.InitialState = State;
1300 for (
unsigned Idx = 0;; ++
Idx) {
1302 char Needs = StateExact | StateWQM;
1308 if (FirstStrict == IE)
1312 if (IsEntry &&
Idx == 0 && (BI.InNeeds & StateWQM))
1320 if (
MI.isTerminator() ||
TII->mayReadEXEC(*
MRI,
MI)) {
1323 if (III->second.Needs & StateStrictWWM)
1324 Needs = StateStrictWWM;
1325 else if (III->second.Needs & StateStrictWQM)
1326 Needs = StateStrictWQM;
1327 else if (III->second.Needs & StateWQM)
1330 Needs &= ~III->second.Disabled;
1331 OutNeeds = III->second.OutNeeds;
1336 Needs = StateExact | StateWQM | StateStrict;
1340 if (
MI.isBranch() && OutNeeds == StateExact)
1346 if (BI.OutNeeds & StateWQM)
1348 else if (BI.OutNeeds == StateExact)
1351 Needs = StateWQM | StateExact;
1355 if (!(Needs & State)) {
1357 if (State == StateStrictWWM || Needs == StateStrictWWM ||
1358 State == StateStrictWQM || Needs == StateStrictWQM) {
1360 First = FirstStrict;
1367 bool SaveSCC =
false;
1370 case StateStrictWWM:
1371 case StateStrictWQM:
1375 SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1379 SaveSCC = !(Needs & StateWQM);
1385 char StartState = State & StateStrict ? NonStrictState : State;
1387 StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
1388 bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
1389 !(Needs & StateExact);
1390 bool PreferLast = Needs == StateWQM;
1395 if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
1397 if (
TII->hasUnwantedEffectsWhenEXECEmpty(*
I)) {
1398 PreferLast = WQMToExact;
1404 prepareInsertion(
MBB,
First,
II, PreferLast, SaveSCC);
1406 if (State & StateStrict) {
1407 assert(State == StateStrictWWM || State == StateStrictWQM);
1408 assert(SavedNonStrictReg);
1409 fromStrictMode(
MBB, Before, SavedNonStrictReg, NonStrictState, State);
1411 LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1412 SavedNonStrictReg = 0;
1413 State = NonStrictState;
1416 if (Needs & StateStrict) {
1417 NonStrictState = State;
1418 assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1419 assert(!SavedNonStrictReg);
1420 SavedNonStrictReg =
MRI->createVirtualRegister(BoolRC);
1422 toStrictMode(
MBB, Before, SavedNonStrictReg, Needs);
1426 if (!WQMFromExec && (OutNeeds & StateWQM)) {
1428 SavedWQMReg =
MRI->createVirtualRegister(BoolRC);
1431 toExact(
MBB, Before, SavedWQMReg);
1433 }
else if (ExactToWQM) {
1434 assert(WQMFromExec == (SavedWQMReg == 0));
1436 toWQM(
MBB, Before, SavedWQMReg);
1439 LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1452 if (Needs != (StateExact | StateWQM | StateStrict)) {
1453 if (Needs != (StateExact | StateWQM))
1464 assert(!SavedNonStrictReg);
1467bool SIWholeQuadMode::lowerLiveMaskQueries() {
1476 LIS->ReplaceMachineInstrInMaps(*
MI, *Copy);
1477 MI->eraseFromParent();
1479 return !LiveMaskQueries.empty();
1482bool SIWholeQuadMode::lowerCopyInstrs() {
1484 assert(
MI->getNumExplicitOperands() == 2);
1489 TRI->getRegClassForOperandReg(*
MRI,
MI->getOperand(0));
1490 if (
TRI->isVGPRClass(regClass)) {
1491 const unsigned MovOp =
TII->getMovOpcode(regClass);
1492 MI->setDesc(
TII->get(MovOp));
1497 return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1503 if (
MI->getOperand(0).isEarlyClobber()) {
1504 LIS->removeInterval(Reg);
1505 MI->getOperand(0).setIsEarlyClobber(
false);
1506 LIS->createAndComputeVirtRegInterval(Reg);
1508 int Index =
MI->findRegisterUseOperandIdx(AMDGPU::EXEC,
nullptr);
1509 while (Index >= 0) {
1510 MI->removeOperand(Index);
1511 Index =
MI->findRegisterUseOperandIdx(AMDGPU::EXEC,
nullptr);
1513 MI->setDesc(
TII->get(AMDGPU::COPY));
1520 if (
MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
1521 assert(
MI->getNumExplicitOperands() == 6);
1524 if (
MI->getOperand(4).isReg())
1525 RecomputeLI = &LIS->getInterval(
MI->getOperand(4).getReg());
1527 MI->removeOperand(5);
1528 MI->removeOperand(4);
1529 MI->removeOperand(3);
1530 MI->removeOperand(1);
1533 LIS->shrinkToUses(RecomputeLI);
1535 assert(
MI->getNumExplicitOperands() == 2);
1538 unsigned CopyOp =
MI->getOperand(1).isReg()
1540 :
TII->getMovOpcode(
TRI->getRegClassForOperandReg(
1541 *
MRI,
MI->getOperand(0)));
1542 MI->setDesc(
TII->get(CopyOp));
1545 return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
1548bool SIWholeQuadMode::lowerKillInstrs(
bool IsWQM) {
1551 switch (
MI->getOpcode()) {
1552 case AMDGPU::SI_DEMOTE_I1:
1553 case AMDGPU::SI_KILL_I1_TERMINATOR:
1554 SplitPoint = lowerKillI1(*
MI, IsWQM);
1556 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1557 SplitPoint = lowerKillF32(*
MI);
1563 return !KillInstrs.empty();
1568 bool IsWave32 =
ST->isWave32();
1570 if (
MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
1572 "init whole wave not in entry block");
1573 Register EntryExec =
MRI->createVirtualRegister(
TRI->getBoolRC());
1576 TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
1577 : AMDGPU::S_OR_SAVEEXEC_B64),
1582 MRI->replaceRegWith(
MI.getOperand(0).getReg(), EntryExec);
1585 LIS->RemoveMachineInstrFromMaps(
MI);
1588 MI.eraseFromParent();
1591 LIS->InsertMachineInstrInMaps(*SaveExec);
1592 LIS->createAndComputeVirtRegInterval(EntryExec);
1597 if (
MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1601 TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1603 .
addImm(
MI.getOperand(0).getImm());
1605 LIS->RemoveMachineInstrFromMaps(
MI);
1606 LIS->InsertMachineInstrInMaps(*InitMI);
1608 MI.eraseFromParent();
1619 Register InputReg =
MI.getOperand(0).getReg();
1625 if (DefInstr != FirstMI) {
1631 LIS->handleMove(*DefInstr);
1643 Register CountReg =
MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1644 auto BfeMI =
BuildMI(*
MBB, FirstMI,
DL,
TII->get(AMDGPU::S_BFE_U32), CountReg)
1646 .
addImm((
MI.getOperand(1).getImm() & Mask) | 0x70000);
1649 TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1652 auto CmpMI =
BuildMI(*
MBB, FirstMI,
DL,
TII->get(AMDGPU::S_CMP_EQ_U32))
1657 TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1662 MI.eraseFromParent();
1666 LIS->RemoveMachineInstrFromMaps(
MI);
1667 MI.eraseFromParent();
1669 LIS->InsertMachineInstrInMaps(*BfeMI);
1670 LIS->InsertMachineInstrInMaps(*BfmMI);
1671 LIS->InsertMachineInstrInMaps(*CmpMI);
1672 LIS->InsertMachineInstrInMaps(*CmovMI);
1674 LIS->removeInterval(InputReg);
1675 LIS->createAndComputeVirtRegInterval(InputReg);
1676 LIS->createAndComputeVirtRegInterval(CountReg);
1689 if (
MI->getParent() == &Entry)
1690 InsertPt = std::next(
MI->getIterator());
1701 <<
" ------------- \n");
1706 LiveMaskQueries.clear();
1707 LowerToCopyInstrs.clear();
1708 LowerToMovInstrs.clear();
1710 InitExecInstrs.clear();
1711 SetInactiveInstrs.clear();
1712 StateTransition.clear();
1714 if (
ST->isWave32()) {
1715 AndOpc = AMDGPU::S_AND_B32;
1716 AndTermOpc = AMDGPU::S_AND_B32_term;
1717 AndN2Opc = AMDGPU::S_ANDN2_B32;
1718 XorOpc = AMDGPU::S_XOR_B32;
1719 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1720 AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1721 WQMOpc = AMDGPU::S_WQM_B32;
1722 Exec = AMDGPU::EXEC_LO;
1724 AndOpc = AMDGPU::S_AND_B64;
1725 AndTermOpc = AMDGPU::S_AND_B64_term;
1726 AndN2Opc = AMDGPU::S_ANDN2_B64;
1727 XorOpc = AMDGPU::S_XOR_B64;
1728 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1729 AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1730 WQMOpc = AMDGPU::S_WQM_B64;
1731 Exec = AMDGPU::EXEC;
1735 bool Changed =
false;
1743 const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
1744 const bool HasWaveModes = GlobalFlags & ~StateExact;
1745 const bool HasKills = !KillInstrs.empty();
1746 const bool UsesWQM = GlobalFlags & StateWQM;
1747 if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
1748 LiveMaskReg =
MRI->createVirtualRegister(
TRI->getBoolRC());
1752 LIS->InsertMachineInstrInMaps(*
MI);
1759 if (LowerToCopyInstrs.contains(
MI))
1762 if (
Info.MarkedStates & StateStrict) {
1763 Info.Needs |= StateStrictWWM;
1764 Info.Disabled &= ~StateStrictWWM;
1765 Blocks[
MI->getParent()].Needs |= StateStrictWWM;
1768 LowerToCopyInstrs.insert(
MI);
1774 Changed |= lowerLiveMaskQueries();
1775 Changed |= lowerCopyInstrs();
1777 if (!HasWaveModes) {
1779 Changed |= lowerKillInstrs(
false);
1780 }
else if (GlobalFlags == StateWQM) {
1784 LIS->InsertMachineInstrInMaps(*
MI);
1785 lowerKillInstrs(
true);
1789 if (GlobalFlags & StateWQM)
1793 processBlock(*BII.first, BII.second, BII.first == &Entry);
1796 lowerBlock(*BII.first, BII.second);
1801 if (LiveMaskReg != Exec)
1802 LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1807 LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1810 if (!KillInstrs.empty() || !InitExecInstrs.empty())
1811 LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1816bool SIWholeQuadModeLegacy::runOnMachineFunction(
MachineFunction &MF) {
1817 LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
1818 auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1821 getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1823 PDTWrapper ? &PDTWrapper->getPostDomTree() :
nullptr;
1824 SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
1825 return Impl.run(MF);
1838 SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
1839 bool Changed = Impl.run(MF);
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static void analyzeFunction(Function &Fn, const DataLayout &Layout, FunctionVarLocsBuilder *FnVarLocs)
Analysis containing CSE Info
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
DenseMap< Block *, BlockRelaxAux > Blocks
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, MachineDominatorTree *MDT)
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an Operation in the Expression.
Core dominator tree base class.
FunctionPass class - This class is used to implement most global optimizations.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
LiveInterval - This class represents the liveness of a register, or stack slot.
Result of a LiveRange query.
VNInfo * valueIn() const
Return the value that is live-in to the instruction.
This class represents the liveness of a register, stack slot, etc.
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
LiveQueryResult Query(SlotIndex Idx) const
Query Liveness at Idx.
VNInfo * getVNInfoBefore(SlotIndex Idx) const
getVNInfoBefore - Return the VNInfo that is live up to but not necessarily including Idx,...
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
An RAII based helper class to modify MachineFunctionProperties when running pass.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
succ_iterator succ_begin()
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
unsigned succ_size() const
LLVM_ABI iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
pred_iterator pred_begin()
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
reverse_iterator rbegin()
iterator_range< pred_iterator > predecessors()
Analysis pass which computes a MachineDominatorTree.
Analysis pass which computes a MachineDominatorTree.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
virtual MachineFunctionProperties getClearedProperties() const
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void dump() const
dump - Print the current MachineFunction to cerr, useful for debugger use.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI MachineInstr * removeFromParent()
Unlink 'this' from the containing basic block, and return it without deleting it.
const MachineBasicBlock * getParent() const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
SlotIndex - An opaque wrapper around machine indexes.
SlotIndex getBaseIndex() const
Returns the base index for associated with this index.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
A Use represents the edge between a Value definition and its users.
VNInfo - Value Number Information.
LLVM Value Representation.
self_iterator getIterator()
This class implements an extremely fast bulk output stream that can only output to a stream.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char WavefrontSize[]
Key for Kernel::CodeProps::Metadata::mWavefrontSize.
LLVM_READONLY int getVOPe32(uint16_t Opcode)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
@ Define
Register definition.
@ Kill
The last use of a register.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< PhiNode * > Phi
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionPass * createSIWholeQuadModeLegacyPass()
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
static constexpr LaneBitmask getAll()
constexpr bool any() const
static constexpr LaneBitmask getNone()
This represents a simple continuous liveness interval for a value.