22#define DEBUG_TYPE "machine-scheduler"
126 case NoCand:
return "NOCAND";
128 case Latency:
return "LATENCY";
130 case Depth:
return "DEPTH";
143 if (TryVal < CandVal) {
147 if (TryVal > CandVal) {
160 if (TryVal > CandVal) {
164 if (TryVal < CandVal) {
177 NodeNum2Index[SU->
NodeNum] = SUnits.size();
178 SUnits.push_back(SU);
182void SIScheduleBlock::traceCandidate(
const SISchedCandidate &Cand) {
189void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
190 SISchedCandidate &TryCand) {
192 if (!Cand.isValid()) {
197 if (Cand.SGPRUsage > 60 &&
218 Cand.HasLowLatencyNonWaitedParent,
226 if (TryCand.IsLowLatency &&
236 if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
241SUnit* SIScheduleBlock::pickNode() {
242 SISchedCandidate TopCand;
244 for (
SUnit* SU : TopReadySUs) {
245 SISchedCandidate TryCand;
246 std::vector<unsigned> pressure;
247 std::vector<unsigned> MaxPressure;
251 TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
252 TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
255 TryCand.HasLowLatencyNonWaitedParent =
256 HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
257 tryCandidateTopDown(TopCand, TryCand);
258 if (TryCand.Reason !=
NoCand)
259 TopCand.setBest(TryCand);
272 for (
SUnit* SU : SUnits) {
273 if (!SU->NumPredsLeft)
274 TopReadySUs.push_back(SU);
277 while (!TopReadySUs.empty()) {
278 SUnit *SU = TopReadySUs[0];
279 ScheduledSUnits.push_back(SU);
291 if (
MI.isDebugValue())
294 if (InstSlot >=
First && InstSlot <=
Last)
312 for (
SUnit* SU : ScheduledSUnits) {
313 RPTracker.setPos(SU->getInstr());
318 RPTracker.closeRegion();
321 TopRPTracker.
addLiveRegs(RPTracker.getPressure().LiveInRegs);
322 BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
325 for (
const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
326 if (RegMaskPair.RegUnit.isVirtual())
327 LiveInRegs.insert(RegMaskPair.RegUnit);
352 for (
const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
354 if (
Reg.isVirtual() &&
358 LiveOutRegs.insert(Reg);
379 initRegPressure(BeginBlock, EndBlock);
386 for (
SUnit* SU : SUnits) {
387 if (!SU->NumPredsLeft)
388 TopReadySUs.push_back(SU);
391 while (!TopReadySUs.empty()) {
392 SUnit *SU = pickNode();
393 ScheduledSUnits.push_back(SU);
400 InternalAdditionalPressure.resize(TopPressure.
MaxSetPressure.size());
404 assert(SUnits.size() == ScheduledSUnits.size() &&
405 TopReadySUs.empty());
406 for (
SUnit* SU : SUnits) {
408 SU->NumPredsLeft == 0);
415void SIScheduleBlock::undoSchedule() {
416 for (
SUnit* SU : SUnits) {
417 SU->isScheduled =
false;
418 for (
SDep& Succ : SU->Succs) {
420 undoReleaseSucc(SU, &Succ);
423 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
424 ScheduledSUnits.clear();
428void SIScheduleBlock::undoReleaseSucc(
SUnit *SU,
SDep *SuccEdge) {
438void SIScheduleBlock::releaseSucc(
SUnit *SU,
SDep *SuccEdge) {
447 dbgs() <<
"*** Scheduling failed! ***\n";
449 dbgs() <<
" has been released too many times!\n";
458void SIScheduleBlock::releaseSuccessors(
SUnit *SU,
bool InOrOutBlock) {
468 releaseSucc(SU, &Succ);
470 TopReadySUs.push_back(SuccSU);
474void SIScheduleBlock::nodeScheduled(
SUnit *SU) {
477 std::vector<SUnit *>::iterator
I =
llvm::find(TopReadySUs, SU);
478 if (
I == TopReadySUs.end()) {
479 dbgs() <<
"Data Structure Bug in SI Scheduler\n";
482 TopReadySUs.erase(
I);
484 releaseSuccessors(SU,
true);
487 if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->
NodeNum]])
488 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
492 std::map<unsigned, unsigned>::iterator
I =
494 if (
I != NodeNum2Index.end())
495 HasLowLatencyNonWaitedParent[
I->second] = 1;
503 for (
SUnit* SU : SUnits) {
504 releaseSuccessors(SU,
false);
506 HighLatencyBlock =
true;
508 HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
513 unsigned PredID = Pred->
getID();
517 if (PredID ==
P->getID())
520 Preds.push_back(Pred);
525 return PredID == S.first->getID();
527 "Loop in the Block Graph!");
532 unsigned SuccID = Succ->
getID();
535 for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
536 if (SuccID == S.first->getID()) {
544 ++NumHighLatencySuccessors;
545 Succs.emplace_back(Succ, Kind);
549 "Loop in the Block Graph!");
554 dbgs() <<
"Block (" <<
ID <<
")\n";
558 dbgs() <<
"\nContains High Latency Instruction: "
559 << HighLatencyBlock <<
'\n';
560 dbgs() <<
"\nDepends On:\n";
562 P->printDebug(
false);
565 dbgs() <<
"\nSuccessors:\n";
566 for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
568 dbgs() <<
"(Data Dep) ";
569 S.first->printDebug(
false);
573 dbgs() <<
"LiveInPressure "
574 << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] <<
' '
575 << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] <<
'\n';
576 dbgs() <<
"LiveOutPressure "
577 << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] <<
' '
578 << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] <<
"\n\n";
579 dbgs() <<
"LiveIns:\n";
583 dbgs() <<
"\nLiveOuts:\n";
588 dbgs() <<
"\nInstructions:\n";
589 for (
const SUnit* SU : SUnits)
592 dbgs() <<
"///////////////////////\n";
603 std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator
B =
604 Blocks.find(BlockVariant);
605 if (
B == Blocks.end()) {
607 createBlocksForVariant(BlockVariant);
609 scheduleInsideBlocks();
611 Res.
Blocks = CurrentBlocks;
614 Blocks[BlockVariant] = Res;
623 return CurrentBlocks[Node2CurrentBlock[SU->
NodeNum]]->getID() ==
ID;
626void SIScheduleBlockCreator::colorHighLatenciesAlone() {
627 unsigned DAGSize = DAG->
SUnits.size();
629 for (
unsigned i = 0, e = DAGSize; i != e; ++i) {
632 CurrentColoring[SU->
NodeNum] = NextReservedID++;
639 for (
const auto &PredDep : SU.
Preds) {
640 if (PredDep.getSUnit() == &FromSU &&
647void SIScheduleBlockCreator::colorHighLatenciesGroups() {
648 unsigned DAGSize = DAG->
SUnits.size();
649 unsigned NumHighLatencies = 0;
651 int Color = NextReservedID;
653 std::set<unsigned> FormingGroup;
655 for (
unsigned i = 0, e = DAGSize; i !=
e; ++i) {
661 if (NumHighLatencies == 0)
664 if (NumHighLatencies <= 6)
666 else if (NumHighLatencies <= 12)
674 unsigned CompatibleGroup =
true;
675 int ProposedColor = Color;
676 std::vector<int> AdditionalElements;
688 for (
unsigned j : FormingGroup) {
690 std::vector<int> SubGraph;
703 if (SubGraph.size() > 5) {
705 CompatibleGroup =
false;
709 for (
unsigned k : SubGraph) {
715 CurrentColoring[k] != 0)) {
716 CompatibleGroup =
false;
722 CompatibleGroup =
false;
726 if (!CompatibleGroup)
730 CompatibleGroup =
false;
740 if (CompatibleGroup) {
741 FormingGroup.insert(SU.
NodeNum);
742 for (
unsigned j : AdditionalElements)
743 CurrentColoring[
j] = ProposedColor;
744 CurrentColoring[SU.
NodeNum] = ProposedColor;
750 if (!CompatibleGroup) {
751 FormingGroup.clear();
752 Color = ++NextReservedID;
753 ProposedColor = Color;
754 FormingGroup.insert(SU.
NodeNum);
755 CurrentColoring[SU.
NodeNum] = ProposedColor;
757 }
else if (Count == GroupSize) {
758 FormingGroup.clear();
759 Color = ++NextReservedID;
760 ProposedColor = Color;
767void SIScheduleBlockCreator::colorComputeReservedDependencies() {
768 unsigned DAGSize = DAG->
SUnits.size();
769 std::map<std::set<unsigned>,
unsigned> ColorCombinations;
771 CurrentTopDownReservedDependencyColoring.clear();
772 CurrentBottomUpReservedDependencyColoring.clear();
774 CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
775 CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
782 std::set<unsigned> SUColors;
785 if (CurrentColoring[SU->
NodeNum]) {
786 CurrentTopDownReservedDependencyColoring[SU->
NodeNum] =
795 if (CurrentTopDownReservedDependencyColoring[Pred->
NodeNum] > 0)
796 SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->
NodeNum]);
799 if (SUColors.empty())
802 if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
803 CurrentTopDownReservedDependencyColoring[SU->
NodeNum] =
807 ColorCombinations.try_emplace(SUColors, NextNonReservedID);
810 CurrentTopDownReservedDependencyColoring[SU->
NodeNum] = Pos->second;
814 ColorCombinations.clear();
820 std::set<unsigned> SUColors;
823 if (CurrentColoring[SU->
NodeNum]) {
824 CurrentBottomUpReservedDependencyColoring[SU->
NodeNum] =
833 if (CurrentBottomUpReservedDependencyColoring[Succ->
NodeNum] > 0)
834 SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->
NodeNum]);
837 if (SUColors.empty())
840 if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
841 CurrentBottomUpReservedDependencyColoring[SU->
NodeNum] =
844 std::map<std::set<unsigned>,
unsigned>::iterator Pos =
845 ColorCombinations.find(SUColors);
846 if (Pos != ColorCombinations.end()) {
847 CurrentBottomUpReservedDependencyColoring[SU->
NodeNum] = Pos->second;
849 CurrentBottomUpReservedDependencyColoring[SU->
NodeNum] =
851 ColorCombinations[SUColors] = NextNonReservedID++;
857void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
858 std::map<std::pair<unsigned, unsigned>,
unsigned> ColorCombinations;
864 std::pair<unsigned, unsigned> SUColors;
867 if (CurrentColoring[SU.
NodeNum])
870 SUColors.first = CurrentTopDownReservedDependencyColoring[SU.
NodeNum];
871 SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.
NodeNum];
874 ColorCombinations.try_emplace(SUColors, NextNonReservedID);
875 CurrentColoring[SU.
NodeNum] = Pos->second;
881void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
882 unsigned DAGSize = DAG->
SUnits.size();
883 std::vector<int> PendingColoring = CurrentColoring;
886 CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
887 CurrentTopDownReservedDependencyColoring.size() == DAGSize);
896 std::set<unsigned> SUColors;
897 std::set<unsigned> SUColorsPending;
899 if (CurrentColoring[SU->
NodeNum] <= (
int)DAGSize)
902 if (CurrentBottomUpReservedDependencyColoring[SU->
NodeNum] > 0 ||
903 CurrentTopDownReservedDependencyColoring[SU->
NodeNum] > 0)
910 if (CurrentBottomUpReservedDependencyColoring[Succ->
NodeNum] > 0 ||
911 CurrentTopDownReservedDependencyColoring[Succ->
NodeNum] > 0)
912 SUColors.insert(CurrentColoring[Succ->
NodeNum]);
913 SUColorsPending.insert(PendingColoring[Succ->
NodeNum]);
918 if (SUColors.size() == 1 && SUColorsPending.size() == 1)
919 PendingColoring[SU->
NodeNum] = *SUColors.begin();
922 PendingColoring[SU->
NodeNum] = NextNonReservedID++;
924 CurrentColoring = PendingColoring;
928void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
929 unsigned DAGSize = DAG->
SUnits.size();
930 unsigned PreviousColor;
931 std::set<unsigned> SeenColors;
936 PreviousColor = CurrentColoring[0];
938 for (
unsigned i = 1, e = DAGSize; i !=
e; ++i) {
940 unsigned CurrentColor = CurrentColoring[i];
941 unsigned PreviousColorSave = PreviousColor;
944 if (CurrentColor != PreviousColor)
945 SeenColors.insert(PreviousColor);
946 PreviousColor = CurrentColor;
948 if (CurrentColoring[SU->
NodeNum] <= (
int)DAGSize)
951 if (SeenColors.find(CurrentColor) == SeenColors.end())
954 if (PreviousColorSave != CurrentColor)
955 CurrentColoring[i] = NextNonReservedID++;
957 CurrentColoring[i] = CurrentColoring[i-1];
961void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
962 unsigned DAGSize = DAG->
SUnits.size();
966 std::set<unsigned> SUColors;
968 if (CurrentColoring[SU->
NodeNum] <= (
int)DAGSize)
980 SUColors.insert(CurrentColoring[Succ->
NodeNum]);
982 if (SUColors.size() == 1)
983 CurrentColoring[SU->
NodeNum] = *SUColors.begin();
987void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
988 unsigned DAGSize = DAG->
SUnits.size();
992 std::set<unsigned> SUColors;
994 if (CurrentColoring[SU->
NodeNum] <= (
int)DAGSize)
1001 SUColors.insert(CurrentColoring[Succ->
NodeNum]);
1003 if (SUColors.size() == 1)
1004 CurrentColoring[SU->
NodeNum] = *SUColors.begin();
1008void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
1009 unsigned DAGSize = DAG->
SUnits.size();
1013 std::set<unsigned> SUColors;
1015 if (CurrentColoring[SU->
NodeNum] <= (
int)DAGSize)
1022 SUColors.insert(CurrentColoring[Succ->
NodeNum]);
1024 if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
1025 CurrentColoring[SU->
NodeNum] = *SUColors.begin();
1029void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
1030 unsigned DAGSize = DAG->
SUnits.size();
1031 std::map<unsigned, unsigned> ColorCount;
1035 unsigned color = CurrentColoring[SU->
NodeNum];
1036 ++ColorCount[color];
1041 unsigned color = CurrentColoring[SU->
NodeNum];
1042 std::set<unsigned> SUColors;
1044 if (CurrentColoring[SU->
NodeNum] <= (
int)DAGSize)
1047 if (ColorCount[color] > 1)
1054 SUColors.insert(CurrentColoring[Succ->
NodeNum]);
1056 if (SUColors.size() == 1 && *SUColors.begin() != color) {
1057 --ColorCount[color];
1058 CurrentColoring[SU->
NodeNum] = *SUColors.begin();
1059 ++ColorCount[*SUColors.begin()];
1064void SIScheduleBlockCreator::cutHugeBlocks() {
1068void SIScheduleBlockCreator::regroupNoUserInstructions() {
1069 unsigned DAGSize = DAG->
SUnits.size();
1070 int GroupID = NextNonReservedID++;
1074 bool hasSuccessor =
false;
1076 if (CurrentColoring[SU->
NodeNum] <= (
int)DAGSize)
1083 hasSuccessor =
true;
1086 CurrentColoring[SU->
NodeNum] = GroupID;
1090void SIScheduleBlockCreator::colorExports() {
1091 unsigned ExportColor = NextNonReservedID++;
1116 "SUnit unexpectedly not representing an instruction!");
1132 for (
unsigned j : ExpGroup)
1133 CurrentColoring[
j] = ExportColor;
1137 unsigned DAGSize = DAG->
SUnits.size();
1138 std::map<unsigned,unsigned> RealID;
1140 CurrentBlocks.clear();
1141 CurrentColoring.clear();
1142 CurrentColoring.resize(DAGSize, 0);
1143 Node2CurrentBlock.clear();
1149 NextNonReservedID = DAGSize + 1;
1154 colorHighLatenciesGroups();
1156 colorHighLatenciesAlone();
1157 colorComputeReservedDependencies();
1158 colorAccordingToReservedDependencies();
1159 colorEndsAccordingToDependencies();
1161 colorForceConsecutiveOrderInGroup();
1162 regroupNoUserInstructions();
1163 colorMergeConstantLoadsNextGroup();
1164 colorMergeIfPossibleNextGroupOnlyForReserved();
1168 Node2CurrentBlock.resize(DAGSize, -1);
1169 for (
unsigned i = 0, e = DAGSize; i !=
e; ++i) {
1171 unsigned Color = CurrentColoring[SU->
NodeNum];
1172 auto [It,
Inserted] = RealID.try_emplace(Color);
1174 int ID = CurrentBlocks.size();
1175 BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG,
this,
ID));
1176 CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
1179 CurrentBlocks[It->second]->addUnit(SU);
1180 Node2CurrentBlock[SU->
NodeNum] = It->second;
1184 for (
unsigned i = 0, e = DAGSize; i !=
e; ++i) {
1186 int SUID = Node2CurrentBlock[i];
1191 if (Node2CurrentBlock[Succ->
NodeNum] != SUID)
1192 CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->
NodeNum]],
1199 if (Node2CurrentBlock[Pred->
NodeNum] != SUID)
1200 CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->
NodeNum]]);
1206 Block->finalizeUnits();
1208 dbgs() <<
"Blocks created:\n\n";
1210 Block->printDebug(
true);
1220 for (;
I !=
End; ++
I) {
1221 if (!
I->isDebugInstr())
1227void SIScheduleBlockCreator::topologicalSort() {
1228 unsigned DAGSize = CurrentBlocks.size();
1229 std::vector<int> WorkList;
1233 WorkList.reserve(DAGSize);
1234 TopDownIndex2Block.resize(DAGSize);
1235 TopDownBlock2Index.resize(DAGSize);
1236 BottomUpIndex2Block.resize(DAGSize);
1238 for (
unsigned i = 0, e = DAGSize; i !=
e; ++i) {
1240 unsigned Degree =
Block->getSuccs().size();
1241 TopDownBlock2Index[i] = Degree;
1243 WorkList.push_back(i);
1248 while (!WorkList.empty()) {
1249 int i = WorkList.back();
1251 WorkList.pop_back();
1252 TopDownBlock2Index[i] = --
Id;
1253 TopDownIndex2Block[
Id] = i;
1255 if (!--TopDownBlock2Index[Pred->getID()])
1256 WorkList.push_back(Pred->getID());
1262 for (
unsigned i = 0, e = DAGSize; i !=
e; ++i) {
1265 assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
1266 "Wrong Top Down topological sorting");
1271 BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
1272 TopDownIndex2Block.rend());
1275void SIScheduleBlockCreator::scheduleInsideBlocks() {
1276 unsigned DAGSize = CurrentBlocks.size();
1282 LLVM_DEBUG(
dbgs() <<
"First phase: Fast scheduling for Reg Liveness\n");
1283 for (
unsigned i = 0, e = DAGSize; i !=
e; ++i) {
1285 Block->fastSchedule();
1293 std::vector<MachineBasicBlock::iterator> PosOld;
1294 std::vector<MachineBasicBlock::iterator> PosNew;
1295 PosOld.reserve(DAG->
SUnits.size());
1296 PosNew.reserve(DAG->
SUnits.size());
1298 for (
unsigned i = 0, e = DAGSize; i !=
e; ++i) {
1299 int BlockIndice = TopDownIndex2Block[i];
1301 std::vector<SUnit*> SUs =
Block->getScheduledUnits();
1303 for (
SUnit* SU : SUs) {
1306 PosOld.push_back(Pos);
1307 if (&*CurrentTopFastSched ==
MI) {
1308 PosNew.push_back(Pos);
1309 CurrentTopFastSched =
nextIfDebug(++CurrentTopFastSched,
1321 PosNew.push_back(CurrentTopFastSched);
1330 for (
unsigned i = 0, e = DAGSize; i !=
e; ++i) {
1332 std::vector<SUnit*> SUs =
Block->getScheduledUnits();
1333 Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
1338 for (
unsigned i = PosOld.size(), e = 0; i != e; --i) {
1352 Block->printDebug(
true);
1356void SIScheduleBlockCreator::fillStats() {
1357 unsigned DAGSize = CurrentBlocks.size();
1359 for (
unsigned i = 0, e = DAGSize; i !=
e; ++i) {
1360 int BlockIndice = TopDownIndex2Block[i];
1362 if (
Block->getPreds().empty())
1367 if (Depth < Pred->
Depth + Pred->getCost())
1368 Depth = Pred->Depth + Pred->getCost();
1374 for (
unsigned i = 0, e = DAGSize; i !=
e; ++i) {
1375 int BlockIndice = BottomUpIndex2Block[i];
1377 if (
Block->getSuccs().empty())
1380 unsigned Height = 0;
1381 for (
const auto &Succ :
Block->getSuccs())
1382 Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
1383 Block->Height = Height;
1393 DAG(DAG), Variant(Variant),
Blocks(BlocksStruct.
Blocks),
1394 LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
1395 SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
1407 LiveOutRegsNumUsages.resize(Blocks.size());
1413 std::set<Register> PredOutRegs = Pred->getOutRegs();
1414 std::set<Register>::iterator RegPos = PredOutRegs.find(Reg);
1416 if (RegPos != PredOutRegs.end()) {
1428 ++LiveOutRegsNumUsages[PredID][Reg];
1432 LastPosHighLatencyParentScheduled.resize(
Blocks.size(), 0);
1433 BlockNumPredsLeft.resize(
Blocks.size());
1434 BlockNumSuccsLeft.resize(
Blocks.size());
1436 for (
unsigned i = 0, e =
Blocks.size(); i != e; ++i) {
1438 BlockNumPredsLeft[i] =
Block->getPreds().size();
1439 BlockNumSuccsLeft[i] =
Block->getSuccs().size();
1443 for (
unsigned i = 0, e =
Blocks.size(); i != e; ++i) {
1449 std::set<Register> InRegs = DAG->
getInRegs();
1450 addLiveRegs(InRegs);
1456 for (
unsigned i = 0, e =
Blocks.size(); i != e; ++i) {
1460 const std::set<Register> &OutRegs =
Block->getOutRegs();
1462 if (OutRegs.find(Reg) == OutRegs.end())
1465 ++LiveOutRegsNumUsages[
ID][Reg];
1476 std::set<Register> PredOutRegs = Pred->getOutRegs();
1477 std::set<Register>::iterator RegPos = PredOutRegs.find(Reg);
1479 if (RegPos != PredOutRegs.end()) {
1486 ++LiveRegsConsumers[Reg];
1490 for (
unsigned i = 0, e =
Blocks.size(); i != e; ++i) {
1492 if (BlockNumPredsLeft[i] == 0) {
1493 ReadyBlocks.push_back(
Block);
1498 BlocksScheduled.push_back(
Block);
1499 blockScheduled(
Block);
1503 : BlocksScheduled) {
1508bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
1509 SIBlockSchedCandidate &TryCand) {
1510 if (!Cand.isValid()) {
1517 Cand.LastPosHighLatParentScheduled, TryCand, Cand,
Latency))
1524 TryCand, Cand,
Depth))
1527 Cand.NumHighLatencySuccessors,
1533bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
1534 SIBlockSchedCandidate &TryCand) {
1535 if (!Cand.isValid()) {
1544 Cand.NumSuccessors > 0,
1556 SIBlockSchedCandidate Cand;
1557 std::vector<SIScheduleBlock*>::iterator Best;
1559 if (ReadyBlocks.empty())
1563 VregCurrentUsage, SregCurrentUsage);
1564 if (VregCurrentUsage > maxVregUsage)
1565 maxVregUsage = VregCurrentUsage;
1566 if (SregCurrentUsage > maxSregUsage)
1567 maxSregUsage = SregCurrentUsage;
1571 dbgs() <<
"\nCurrent Live:\n";
1575 dbgs() <<
"Current VGPRs: " << VregCurrentUsage <<
'\n';
1576 dbgs() <<
"Current SGPRs: " << SregCurrentUsage <<
'\n';);
1578 Cand.Block =
nullptr;
1579 for (std::vector<SIScheduleBlock*>::iterator
I = ReadyBlocks.begin(),
1580 E = ReadyBlocks.end();
I != E; ++
I) {
1581 SIBlockSchedCandidate TryCand;
1583 TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
1584 TryCand.VGPRUsageDiff =
1585 checkRegUsageImpact(TryCand.Block->getInRegs(),
1586 TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
1587 TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
1588 TryCand.NumHighLatencySuccessors =
1589 TryCand.Block->getNumHighLatencySuccessors();
1590 TryCand.LastPosHighLatParentScheduled =
1591 (
unsigned int) std::max<int> (0,
1592 LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
1593 LastPosWaitedHighLatency);
1594 TryCand.Height = TryCand.Block->Height;
1596 if (VregCurrentUsage > 120 ||
1598 if (!tryCandidateRegUsage(Cand, TryCand) &&
1600 tryCandidateLatency(Cand, TryCand);
1602 if (!tryCandidateLatency(Cand, TryCand))
1603 tryCandidateRegUsage(Cand, TryCand);
1605 if (TryCand.Reason !=
NoCand) {
1606 Cand.setBest(TryCand);
1608 LLVM_DEBUG(
dbgs() <<
"Best Current Choice: " << Cand.Block->getID() <<
' '
1614 dbgs() <<
"Is a block with high latency instruction: "
1615 << (Cand.IsHighLatency ?
"yes\n" :
"no\n");
1616 dbgs() <<
"Position of last high latency dependency: "
1617 << Cand.LastPosHighLatParentScheduled <<
'\n';
1618 dbgs() <<
"VGPRUsageDiff: " << Cand.VGPRUsageDiff <<
'\n';
1622 ReadyBlocks.erase(Best);
1628void SIScheduleBlockScheduler::addLiveRegs(std::set<Register> &Regs) {
1631 if (!
Reg.isVirtual())
1634 (void) LiveRegs.insert(Reg);
1639 std::set<Register> &Regs) {
1642 std::set<Register>::iterator Pos = LiveRegs.find(Reg);
1643 assert (Pos != LiveRegs.end() &&
1644 LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
1645 LiveRegsConsumers[Reg] >= 1);
1646 --LiveRegsConsumers[
Reg];
1647 if (LiveRegsConsumers[Reg] == 0)
1648 LiveRegs.erase(Pos);
1652void SIScheduleBlockScheduler::releaseBlockSuccs(
SIScheduleBlock *Parent) {
1654 if (--BlockNumPredsLeft[
Block.first->getID()] == 0)
1655 ReadyBlocks.push_back(
Block.first);
1659 LastPosHighLatencyParentScheduled[
Block.first->getID()] = NumBlockScheduled;
1665 addLiveRegs(
Block->getOutRegs());
1666 releaseBlockSuccs(
Block);
1667 for (
const auto &RegP : LiveOutRegsNumUsages[
Block->getID()]) {
1669 assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
1670 LiveRegsConsumers[RegP.first] == 0);
1671 LiveRegsConsumers[RegP.first] += RegP.second;
1673 if (LastPosHighLatencyParentScheduled[
Block->getID()] >
1674 (
unsigned)LastPosWaitedHighLatency)
1675 LastPosWaitedHighLatency =
1676 LastPosHighLatencyParentScheduled[
Block->getID()];
1677 ++NumBlockScheduled;
1681SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs,
1682 std::set<Register> &OutRegs) {
1683 std::vector<int> DiffSetPressure;
1688 if (!
Reg.isVirtual())
1690 if (LiveRegsConsumers[Reg] > 1)
1693 for (; PSetI.
isValid(); ++PSetI) {
1694 DiffSetPressure[*PSetI] -= PSetI.
getWeight();
1700 if (!
Reg.isVirtual())
1703 for (; PSetI.
isValid(); ++PSetI) {
1704 DiffSetPressure[*PSetI] += PSetI.
getWeight();
1708 return DiffSetPressure;
1714SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1715 SISchedulerBlockSchedulerVariant ScheduleVariant) {
1718 std::vector<SIScheduleBlock*> ScheduledBlocks;
1721 ScheduledBlocks =
Scheduler.getBlocks();
1724 std::vector<SUnit*>
SUs =
Block->getScheduledUnits();
1748void SIScheduleDAGMI::topologicalSort() {
1761void SIScheduleDAGMI::moveLowLatencies() {
1762 unsigned DAGSize =
SUnits.size();
1763 int LastLowLatencyUser = -1;
1764 int LastLowLatencyPos = -1;
1766 for (
unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
1768 bool IsLowLatencyUser =
false;
1769 unsigned MinPos = 0;
1774 IsLowLatencyUser =
true;
1778 unsigned PredPos = ScheduledSUnitsInv[Pred->
NodeNum];
1779 if (PredPos >= MinPos)
1780 MinPos = PredPos + 1;
1784 unsigned BestPos = LastLowLatencyUser + 1;
1785 if ((
int)BestPos <= LastLowLatencyPos)
1786 BestPos = LastLowLatencyPos + 1;
1787 if (BestPos < MinPos)
1790 for (
unsigned u = i;
u > BestPos; --
u) {
1791 ++ScheduledSUnitsInv[ScheduledSUnits[
u-1]];
1792 ScheduledSUnits[
u] = ScheduledSUnits[
u-1];
1794 ScheduledSUnits[BestPos] = SU->
NodeNum;
1795 ScheduledSUnitsInv[SU->
NodeNum] = BestPos;
1797 LastLowLatencyPos = BestPos;
1798 if (IsLowLatencyUser)
1799 LastLowLatencyUser = BestPos;
1800 }
else if (IsLowLatencyUser) {
1801 LastLowLatencyUser = i;
1805 bool CopyForLowLat =
false;
1811 CopyForLowLat =
true;
1817 for (
unsigned u = i;
u > MinPos; --
u) {
1818 ++ScheduledSUnitsInv[ScheduledSUnits[
u-1]];
1819 ScheduledSUnits[
u] = ScheduledSUnits[
u-1];
1821 ScheduledSUnits[MinPos] = SU->
NodeNum;
1822 ScheduledSUnitsInv[SU->
NodeNum] = MinPos;
1829 for (
unsigned i = 0, e =
SUnits.size(); i != e; ++i) {
1830 SUnits[i].isScheduled =
false;
1831 SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1832 SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1833 SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1834 SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1839template<
typename _Iterator>
void
1841 unsigned &VgprUsage,
unsigned &SgprUsage) {
1844 for (_Iterator RegI =
First; RegI !=
End; ++RegI) {
1850 for (; PSetI.
isValid(); ++PSetI) {
1851 if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
1853 else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
1885 SUnitsLinksBackup =
SUnits;
1894 for (
unsigned i = 0, e = (
unsigned)
SUnits.size(); i != e; ++i) {
1900 bool OffsetIsScalable;
1901 if (SITII->getMemOperandWithOffset(*SU->
getInstr(), BaseLatOp, OffLatReg,
1902 OffsetIsScalable,
TRI))
1927 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1928 Temp =
Scheduler.scheduleVariant(v.first, v.second);
1948 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1949 Temp =
Scheduler.scheduleVariant(v.first, v.second);
1955 ScheduledSUnits = Best.
SUs;
1956 ScheduledSUnitsInv.resize(
SUnits.size());
1958 for (
unsigned i = 0, e = (
unsigned)
SUnits.size(); i != e; ++i) {
1959 ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
1969 for (
unsigned I : ScheduledSUnits) {
1983 dbgs() <<
"*** Final schedule for "
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DenseMap< Block *, BlockRelaxAux > Blocks
Machine Instruction Scheduler
static MachineBasicBlock::const_iterator nextIfDebug(MachineBasicBlock::const_iterator I, MachineBasicBlock::const_iterator End)
If this iterator is a debug value, increment until reaching the End or a non-debug instruction.
Interface definition for SIInstrInfo.
static const char * getReasonStr(SIScheduleCandReason Reason)
static bool hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU)
static bool isDefBetween(Register Reg, SlotIndex First, SlotIndex Last, const MachineRegisterInfo *MRI, const LiveIntervals *LIS)
SI Machine Scheduler interface.
GenericScheduler shrinks the unscheduled zone using heuristics to balance the schedule.
LLVM_ABI void handleMove(MachineInstr &MI, bool UpdateFlags=false)
Call this method to notify LiveIntervals that instruction MI has been moved within a basic block.
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
PSetIterator getPressureSets(Register RegUnit) const
Get an iterator over the pressure sets affected by the given physical or virtual register.
Iterate over the pressure sets affected by the given physical or virtual register.
unsigned getWeight() const
Track the current register pressure at some position in the instruction stream, and remember the high...
void setPos(MachineBasicBlock::const_iterator Pos)
LLVM_ABI void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
MachineBasicBlock::const_iterator getPos() const
Get the MI position corresponding to this register pressure.
LLVM_ABI void closeTop()
Set the boundary for the top of the region and summarize live ins.
LLVM_ABI void advance()
Advance across the current instruction.
LLVM_ABI void getDownwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction top-down.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
@ Data
Regular data dependence (aka true-dependence).
bool isWeak() const
Tests if this a weak dependence.
bool isCtrl() const
Shorthand for getKind() != SDep::Data.
static bool isEXP(const MachineInstr &MI)
bool isHighLatencyDef(int Opc) const override
bool isLowLatencyInstruction(const MachineInstr &MI) const
bool isSUInBlock(SUnit *SU, unsigned ID)
SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
SIScheduleBlocks getBlocks(SISchedulerBlockCreatorVariant BlockVariant)
SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, SISchedulerBlockSchedulerVariant Variant, SIScheduleBlocks BlocksStruct)
ArrayRef< std::pair< SIScheduleBlock *, SIScheduleBlockLinkKind > > getSuccs() const
void addPred(SIScheduleBlock *Pred)
void printDebug(bool Full)
void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind)
void schedule(MachineBasicBlock::iterator BeginBlock, MachineBasicBlock::iterator EndBlock)
void addUnit(SUnit *SU)
Functions for Block construction.
bool isHighLatencyBlock()
void restoreSULinksLeft()
std::set< Register > getInRegs()
std::vector< int > BottomUpIndex2SU
std::vector< unsigned > IsHighLatencySU
std::vector< unsigned > LowLatencyOffset
std::vector< int > TopDownIndex2SU
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGTopologicalSort * GetTopo()
void fillVgprSgprCost(_Iterator First, _Iterator End, unsigned &VgprUsage, unsigned &SgprUsage)
MachineRegisterInfo * getMRI()
SIScheduleDAGMI(MachineSchedContext *C)
MachineBasicBlock::iterator getCurrentBottom()
std::vector< unsigned > IsLowLatencySU
MachineBasicBlock::iterator getCurrentTop()
MachineBasicBlock * getBB()
void initRPTracker(RegPressureTracker &RPTracker)
std::set< unsigned > getOutRegs()
~SIScheduleDAGMI() override
const TargetRegisterInfo * getTRI()
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned NodeNum
Entry # of node in the node vector.
bool isScheduled
True once scheduled.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
ScheduleDAGTopologicalSort Topo
Topo - A topological ordering for SUnits which permits fast IsReachable and similar queries.
void dumpNode(const SUnit &SU) const override
MachineBasicBlock::iterator begin() const
Returns an iterator to the top of the current scheduling region.
MachineBasicBlock::iterator RegionBegin
The beginning of the range to be scheduled.
ScheduleDAGMILive is an implementation of ScheduleDAGInstrs that schedules machine instructions while updating LiveIntervals and tracking register pressure.
void scheduleMI(SUnit *SU, bool IsTopNode)
Move an instruction and update register pressure.
void initQueues(ArrayRef< SUnit * > TopRoots, ArrayRef< SUnit * > BotRoots)
Release ExitSU predecessors and setup scheduler queues.
void buildDAGWithRegPressure()
Call ScheduleDAGInstrs::buildSchedGraph with register pressure tracking enabled.
void dump() const override
RegPressureTracker TopRPTracker
void dumpSchedule() const
Dump the scheduled sequence.
std::unique_ptr< MachineSchedStrategy > SchedImpl
void postProcessDAG()
Apply each ScheduleDAGMutation step in order.
void findRootsAndBiasEdges(SmallVectorImpl< SUnit * > &TopRoots, SmallVectorImpl< SUnit * > &BotRoots)
MachineBasicBlock::iterator CurrentBottom
The bottom of the unscheduled zone.
void viewGraph() override
Out-of-line implementation with no arguments is handy for gdb.
void placeDebugValues()
Reinsert debug_values recorded in ScheduleDAGInstrs::DbgValues.
MachineBasicBlock::iterator CurrentTop
The top of the unscheduled zone.
LLVM_ABI std::vector< int > GetSubGraph(const SUnit &StartSU, const SUnit &TargetSU, bool &Success)
Returns an array of SUs that are both in the successor subtree of StartSU and in the predecessor subtree of TargetSU.
LLVM_ABI void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
reverse_iterator rbegin()
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
const TargetRegisterInfo * TRI
Target processor register info.
SlotIndex - An opaque wrapper around machine indexes.
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
virtual unsigned getNumRegPressureSets() const =0
Get the number of dimensions of register pressure.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
static bool tryGreater(int TryVal, int CandVal, SISchedulerCandidate &TryCand, SISchedulerCandidate &Cand, SIScheduleCandReason Reason)
static bool tryLess(int TryVal, int CandVal, SISchedulerCandidate &TryCand, SISchedulerCandidate &Cand, SIScheduleCandReason Reason)
Reg
All possible values of the reg field in the ModR/M byte.
This is an optimization pass for GlobalISel generic memory operations.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
cl::opt< bool > PrintDAGs
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
SISchedulerBlockSchedulerVariant
cl::opt< bool > ViewMISchedDAGs
LLVM_ABI Printable printVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *TRI)
Create Printable object to print virtual registers and physical registers on a raw_ostream.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
SISchedulerBlockCreatorVariant
@ LatenciesAlonePlusConsecutive
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Implement std::hash so that hash_code can be used in STL containers.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instantiate a scheduler.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.
std::vector< unsigned > SUs
std::vector< int > TopDownIndex2Block
std::vector< SIScheduleBlock * > Blocks
std::vector< int > TopDownBlock2Index
SIScheduleCandReason Reason
void setRepeat(SIScheduleCandReason R)