LLVM 22.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
37#include "llvm/ADT/STLExtras.h"
39#include "llvm/ADT/SetVector.h"
41#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
72#include "llvm/Config/llvm-config.h"
73#include "llvm/IR/Attributes.h"
74#include "llvm/IR/Function.h"
75#include "llvm/MC/LaneBitmask.h"
76#include "llvm/MC/MCInstrDesc.h"
78#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
83#include <algorithm>
84#include <cassert>
85#include <climits>
86#include <cstdint>
87#include <deque>
88#include <functional>
89#include <iomanip>
90#include <iterator>
91#include <map>
92#include <memory>
93#include <sstream>
94#include <tuple>
95#include <utility>
96#include <vector>
97
98using namespace llvm;
99
100#define DEBUG_TYPE "pipeliner"
101
// Pass-level statistics: how many loops were attempted and successfully
// pipelined, how many node-order anomalies were detected, and one counter
// per distinct reason the pipeliner aborts on a candidate loop.
STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
STATISTIC(NumPipelined, "Number of loops software pipelined");
STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
113
/// A command line option to turn software pipelining on or off.
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
                               cl::desc("Enable Software Pipelining"));

/// A command line option to enable SWP at -Os.
static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
                                      cl::desc("Enable SWP at Os."), cl::Hidden,
                                      cl::init(false));

/// A command line argument to limit minimum initial interval for pipelining.
static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
                              cl::desc("Size limit for the MII."),
                              cl::Hidden, cl::init(27));

/// A command line argument to force pipeliner to use specified initial
/// interval.
static cl::opt<int> SwpForceII("pipeliner-force-ii",
                               cl::desc("Force pipeliner to use specified II."),
                               cl::Hidden, cl::init(-1));

/// A command line argument to limit the number of stages in the pipeline.
// FIXME: the help text says "generated scheduled"; it presumably should read
// "generated schedule". The user-visible string is left untouched here.
static cl::opt<int>
    SwpMaxStages("pipeliner-max-stages",
                 cl::desc("Maximum stages allowed in the generated scheduled."),
                 cl::Hidden, cl::init(3));

/// A command line option to disable the pruning of chain dependences due to
/// an unrelated Phi.
static cl::opt<bool>
    SwpPruneDeps("pipeliner-prune-deps",
                 cl::desc("Prune dependences between unrelated Phi nodes."),
                 cl::Hidden, cl::init(true));

/// A command line option to disable the pruning of loop carried order
/// dependences.
static cl::opt<bool>
    SwpPruneLoopCarried("pipeliner-prune-loop-carried",
                        cl::desc("Prune loop carried order dependences."),
                        cl::Hidden, cl::init(true));

#ifndef NDEBUG
/// A debug-only cap on how many loops the pipeliner will attempt in total
/// (-1 means no limit); enforced via the NumTries counter in scheduleLoop().
static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
#endif
157
158static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
160 cl::desc("Ignore RecMII"));
161
/// Debug-only switch; presumably dumps the resource masks used by the
/// resource model — use site is outside this chunk, TODO confirm.
static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
                                    cl::init(false));

/// Debug-only switch; presumably enables extra resource-model debug output —
/// use site is outside this chunk, TODO confirm.
static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
                                      cl::init(false));
166
168 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
169 cl::desc("Instead of emitting the pipelined code, annotate instructions "
170 "with the generated schedule for feeding into the "
171 "-modulo-schedule-test pass"));
172
174 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
175 cl::desc(
176 "Use the experimental peeling code generator for software pipelining"));
177
/// A command line argument bounding how far above the minimum II the
/// pipeliner searches for a feasible schedule (MAX_II = MII + this range;
/// see SwingSchedulerDAG::setMAX_II).
static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
                                     cl::desc("Range to search for II"),
                                     cl::Hidden, cl::init(10));

/// A command line option to make the scheduler take register pressure into
/// account when accepting a schedule.
static cl::opt<bool>
    LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
                     cl::desc("Limit register pressure of scheduled loop"));

/// A command line argument for the percentage of the register pressure limit
/// that is kept free as headroom.
static cl::opt<int>
    RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
                      cl::init(5),
                      cl::desc("Margin representing the unused percentage of "
                               "the register pressure limit"));

/// A command line option to select the MVE code generator for the pipelined
/// loop.
static cl::opt<bool>
    MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
               cl::desc("Use the MVE code generator for software pipelining"));
195
196namespace llvm {
197
198// A command line option to enable the CopyToPhi DAG mutation.
200 cl::init(true),
201 cl::desc("Enable CopyToPhi DAG Mutation"));
202
203/// A command line argument to force pipeliner to use specified issue
204/// width.
206 "pipeliner-force-issue-width",
207 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
208 cl::init(-1));
209
210/// A command line argument to set the window scheduling option.
213 cl::desc("Set how to use window scheduling algorithm."),
215 "Turn off window algorithm."),
217 "Use window algorithm after SMS algorithm fails."),
219 "Use window algorithm instead of SMS algorithm.")));
220
221} // end namespace llvm
222
223unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
224char MachinePipeliner::ID = 0;
225#ifndef NDEBUG
227#endif
229
231 "Modulo Software Pipelining", false, false)
237 "Modulo Software Pipelining", false, false)
238
239namespace {
240
241/// This class holds an SUnit corresponding to a memory operation and other
242/// information related to the instruction.
246
247 /// The value of a memory operand.
248 const Value *MemOpValue = nullptr;
249
250 /// The offset of a memory operand.
251 int64_t MemOpOffset = 0;
252
254
255 /// True if all the underlying objects are identified.
256 bool IsAllIdentified = false;
257
259
260 bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
261
262 bool isUnknown() const { return MemOpValue == nullptr; }
263
264private:
266};
267
268/// Add loop-carried chain dependencies. This class handles the same type of
269/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
270/// account dependencies across iterations.
272 // Type of instruction that is relevant to order-dependencies
273 enum class InstrTag {
274 Barrier = 0, ///< A barrier event instruction.
275 LoadOrStore = 1, ///< An instruction that may load or store memory, but is
276 ///< not a barrier event.
277 FPExceptions = 2, ///< An instruction that does not match above, but may
                      ///< raise floating-point exceptions.
279 };
280
281 struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
282 TaggedSUnit(SUnit *SU, InstrTag Tag)
283 : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}
284
285 InstrTag getTag() const { return InstrTag(getInt()); }
286 };
287
288 /// Holds loads and stores with memory related information.
289 struct LoadStoreChunk {
292
293 void append(SUnit *SU);
294 };
295
297 BatchAAResults *BAA;
298 std::vector<SUnit> &SUnits;
299
300 /// The size of SUnits, for convenience.
301 const unsigned N;
302
303 /// Loop-carried Edges.
304 std::vector<BitVector> LoopCarried;
305
306 /// Instructions related to chain dependencies. They are one of the
307 /// following:
308 ///
309 /// 1. Barrier event.
310 /// 2. Load, but neither a barrier event, invariant load, nor may load trap
311 /// value.
312 /// 3. Store, but not a barrier event.
313 /// 4. None of them, but may raise floating-point exceptions.
314 ///
315 /// This is used when analyzing loop-carried dependencies that access global
316 /// barrier instructions.
317 std::vector<TaggedSUnit> TaggedSUnits;
318
319 const TargetInstrInfo *TII = nullptr;
320 const TargetRegisterInfo *TRI = nullptr;
321
322public:
324 const TargetInstrInfo *TII,
325 const TargetRegisterInfo *TRI);
326
327 /// The main function to compute loop-carried order-dependencies.
328 void computeDependencies();
329
330 const BitVector &getLoopCarried(unsigned Idx) const {
331 return LoopCarried[Idx];
332 }
333
334private:
335 /// Tags to \p SU if the instruction may affect the order-dependencies.
336 std::optional<InstrTag> getInstrTag(SUnit *SU) const;
337
338 void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
339 const LoadStoreChunk &To);
340
341 /// Add a loop-carried order dependency between \p Src and \p Dst if we
342 /// cannot prove they are independent. When \p PerformCheapCheck is true, a
343 /// lightweight dependency test (referred to as "cheap check" below) is
344 /// performed at first. Note that the cheap check is retained to maintain the
345 /// existing behavior and not expected to be used anymore.
346 ///
347 /// TODO: Remove \p PerformCheapCheck and the corresponding cheap check.
348 void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
349 const SUnitWithMemInfo &Dst,
350 bool PerformCheapCheck = false);
351
352 void computeDependenciesAux();
353};
354
355} // end anonymous namespace
356
357/// The "main" function for implementing Swing Modulo Scheduling.
359 if (skipFunction(mf.getFunction()))
360 return false;
361
362 if (!EnableSWP)
363 return false;
364
365 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
367 return false;
368
370 return false;
371
372 // Cannot pipeline loops without instruction itineraries if we are using
373 // DFA for the pipeliner.
374 if (mf.getSubtarget().useDFAforSMS() &&
377 return false;
378
379 MF = &mf;
380 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
381 MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
382 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
385
386 for (const auto &L : *MLI)
387 scheduleLoop(*L);
388
389 return false;
390}
391
392/// Attempt to perform the SMS algorithm on the specified loop. This function is
393/// the main entry point for the algorithm. The function identifies candidate
394/// loops, calculates the minimum initiation interval, and attempts to schedule
395/// the loop.
396bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
397 bool Changed = false;
398 for (const auto &InnerLoop : L)
399 Changed |= scheduleLoop(*InnerLoop);
400
401#ifndef NDEBUG
402 // Stop trying after reaching the limit (if any).
403 int Limit = SwpLoopLimit;
404 if (Limit >= 0) {
405 if (NumTries >= SwpLoopLimit)
406 return Changed;
407 NumTries++;
408 }
409#endif
410
411 setPragmaPipelineOptions(L);
412 if (!canPipelineLoop(L)) {
413 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
414 ORE->emit([&]() {
415 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
416 L.getStartLoc(), L.getHeader())
417 << "Failed to pipeline loop";
418 });
419
420 LI.LoopPipelinerInfo.reset();
421 return Changed;
422 }
423
424 ++NumTrytoPipeline;
425 if (useSwingModuloScheduler())
426 Changed = swingModuloScheduler(L);
427
428 if (useWindowScheduler(Changed))
429 Changed = runWindowScheduler(L);
430
431 LI.LoopPipelinerInfo.reset();
432 return Changed;
433}
434
435void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
436 // Reset the pragma for the next loop in iteration.
437 disabledByPragma = false;
438 II_setByPragma = 0;
439
440 MachineBasicBlock *LBLK = L.getTopBlock();
441
442 if (LBLK == nullptr)
443 return;
444
445 const BasicBlock *BBLK = LBLK->getBasicBlock();
446 if (BBLK == nullptr)
447 return;
448
449 const Instruction *TI = BBLK->getTerminator();
450 if (TI == nullptr)
451 return;
452
453 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
454 if (LoopID == nullptr)
455 return;
456
457 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
458 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
459
460 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
461 MDNode *MD = dyn_cast<MDNode>(MDO);
462
463 if (MD == nullptr)
464 continue;
465
466 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
467
468 if (S == nullptr)
469 continue;
470
471 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
472 assert(MD->getNumOperands() == 2 &&
473 "Pipeline initiation interval hint metadata should have two operands.");
475 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
476 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
477 } else if (S->getString() == "llvm.loop.pipeline.disable") {
478 disabledByPragma = true;
479 }
480 }
481}
482
483/// Return true if the loop can be software pipelined. The algorithm is
484/// restricted to loops with a single basic block. Make sure that the
485/// branch in the loop can be analyzed.
486bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
487 if (L.getNumBlocks() != 1) {
488 ORE->emit([&]() {
489 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
490 L.getStartLoc(), L.getHeader())
491 << "Not a single basic block: "
492 << ore::NV("NumBlocks", L.getNumBlocks());
493 });
494 return false;
495 }
496
497 if (disabledByPragma) {
498 ORE->emit([&]() {
499 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
500 L.getStartLoc(), L.getHeader())
501 << "Disabled by Pragma.";
502 });
503 return false;
504 }
505
506 // Check if the branch can't be understood because we can't do pipelining
507 // if that's the case.
508 LI.TBB = nullptr;
509 LI.FBB = nullptr;
510 LI.BrCond.clear();
511 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
512 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
513 NumFailBranch++;
514 ORE->emit([&]() {
515 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
516 L.getStartLoc(), L.getHeader())
517 << "The branch can't be understood";
518 });
519 return false;
520 }
521
522 LI.LoopInductionVar = nullptr;
523 LI.LoopCompare = nullptr;
525 if (!LI.LoopPipelinerInfo) {
526 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
527 NumFailLoop++;
528 ORE->emit([&]() {
529 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
530 L.getStartLoc(), L.getHeader())
531 << "The loop structure is not supported";
532 });
533 return false;
534 }
535
536 if (!L.getLoopPreheader()) {
537 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
538 NumFailPreheader++;
539 ORE->emit([&]() {
540 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
541 L.getStartLoc(), L.getHeader())
542 << "No loop preheader found";
543 });
544 return false;
545 }
546
547 // Remove any subregisters from inputs to phi nodes.
548 preprocessPhiNodes(*L.getHeader());
549 return true;
550}
551
552void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
554 SlotIndexes &Slots =
555 *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
556
557 for (MachineInstr &PI : B.phis()) {
558 MachineOperand &DefOp = PI.getOperand(0);
559 assert(DefOp.getSubReg() == 0);
560 auto *RC = MRI.getRegClass(DefOp.getReg());
561
562 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
563 MachineOperand &RegOp = PI.getOperand(i);
564 if (RegOp.getSubReg() == 0)
565 continue;
566
567 // If the operand uses a subregister, replace it with a new register
568 // without subregisters, and generate a copy to the new register.
569 Register NewReg = MRI.createVirtualRegister(RC);
570 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
572 const DebugLoc &DL = PredB.findDebugLoc(At);
573 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
574 .addReg(RegOp.getReg(), getRegState(RegOp),
575 RegOp.getSubReg());
576 Slots.insertMachineInstrInMaps(*Copy);
577 RegOp.setReg(NewReg);
578 RegOp.setSubReg(0);
579 }
580 }
581}
582
583/// The SMS algorithm consists of the following main steps:
584/// 1. Computation and analysis of the dependence graph.
585/// 2. Ordering of the nodes (instructions).
586/// 3. Attempt to Schedule the loop.
587bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
588 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
589
590 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
592 *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
594
595 MachineBasicBlock *MBB = L.getHeader();
596 // The kernel should not include any terminator instructions. These
597 // will be added back later.
598 SMS.startBlock(MBB);
599
600 // Compute the number of 'real' instructions in the basic block by
601 // ignoring terminators.
602 unsigned size = MBB->size();
604 E = MBB->instr_end();
605 I != E; ++I, --size)
606 ;
607
608 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
609 SMS.schedule();
610 SMS.exitRegion();
611
612 SMS.finishBlock();
613 return SMS.hasNewSchedule();
614}
615
625}
626
627bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
629 Context.MF = MF;
630 Context.MLI = MLI;
631 Context.MDT = MDT;
632 Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
633 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
634 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
635 Context.RegClassInfo->runOnMachineFunction(*MF);
636 WindowScheduler WS(&Context, L);
637 return WS.run();
638}
639
640bool MachinePipeliner::useSwingModuloScheduler() {
641 // SwingModuloScheduler does not work when WindowScheduler is forced.
643}
644
645bool MachinePipeliner::useWindowScheduler(bool Changed) {
646 // WindowScheduler does not work for following cases:
647 // 1. when it is off.
648 // 2. when SwingModuloScheduler is successfully scheduled.
649 // 3. when pragma II is enabled.
650 if (II_setByPragma) {
651 LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
652 "llvm.loop.pipeline.initiationinterval is set.\n");
653 return false;
654 }
655
658}
659
660void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
661 if (SwpForceII > 0)
662 MII = SwpForceII;
663 else if (II_setByPragma > 0)
664 MII = II_setByPragma;
665 else
666 MII = std::max(ResMII, RecMII);
667}
668
669void SwingSchedulerDAG::setMAX_II() {
670 if (SwpForceII > 0)
671 MAX_II = SwpForceII;
672 else if (II_setByPragma > 0)
673 MAX_II = II_setByPragma;
674 else
675 MAX_II = MII + SwpIISearchRange;
676}
677
678/// We override the schedule function in ScheduleDAGInstrs to implement the
679/// scheduling part of the Swing Modulo Scheduling algorithm.
681 buildSchedGraph(AA);
682 const LoopCarriedEdges LCE = addLoopCarriedDependences();
683 updatePhiDependences();
685 changeDependences();
686 postProcessDAG();
687 DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
688 LLVM_DEBUG({
689 dump();
690 dbgs() << "===== Loop Carried Edges Begin =====\n";
691 for (SUnit &SU : SUnits)
692 LCE.dump(&SU, TRI, &MRI);
693 dbgs() << "===== Loop Carried Edges End =====\n";
694 });
695
696 NodeSetType NodeSets;
697 findCircuits(NodeSets);
698 NodeSetType Circuits = NodeSets;
699
700 // Calculate the MII.
701 unsigned ResMII = calculateResMII();
702 unsigned RecMII = calculateRecMII(NodeSets);
703
704 fuseRecs(NodeSets);
705
706 // This flag is used for testing and can cause correctness problems.
707 if (SwpIgnoreRecMII)
708 RecMII = 0;
709
710 setMII(ResMII, RecMII);
711 setMAX_II();
712
713 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
714 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
715
716 // Can't schedule a loop without a valid MII.
717 if (MII == 0) {
718 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
719 NumFailZeroMII++;
720 Pass.ORE->emit([&]() {
722 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
723 << "Invalid Minimal Initiation Interval: 0";
724 });
725 return;
726 }
727
728 // Don't pipeline large loops.
729 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
730 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
731 << ", we don't pipeline large loops\n");
732 NumFailLargeMaxMII++;
733 Pass.ORE->emit([&]() {
735 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
736 << "Minimal Initiation Interval too large: "
737 << ore::NV("MII", (int)MII) << " > "
738 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
739 << "Refer to -pipeliner-max-mii.";
740 });
741 return;
742 }
743
744 computeNodeFunctions(NodeSets);
745
746 registerPressureFilter(NodeSets);
747
748 colocateNodeSets(NodeSets);
749
750 checkNodeSets(NodeSets);
751
752 LLVM_DEBUG({
753 for (auto &I : NodeSets) {
754 dbgs() << " Rec NodeSet ";
755 I.dump();
756 }
757 });
758
759 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
760
761 groupRemainingNodes(NodeSets);
762
763 removeDuplicateNodes(NodeSets);
764
765 LLVM_DEBUG({
766 for (auto &I : NodeSets) {
767 dbgs() << " NodeSet ";
768 I.dump();
769 }
770 });
771
772 computeNodeOrder(NodeSets);
773
774 // check for node order issues
775 checkValidNodeOrder(Circuits);
776
777 SMSchedule Schedule(Pass.MF, this);
778 Scheduled = schedulePipeline(Schedule);
779
780 if (!Scheduled){
781 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
782 NumFailNoSchedule++;
783 Pass.ORE->emit([&]() {
785 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
786 << "Unable to find schedule";
787 });
788 return;
789 }
790
791 unsigned numStages = Schedule.getMaxStageCount();
792 // No need to generate pipeline if there are no overlapped iterations.
793 if (numStages == 0) {
794 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
795 NumFailZeroStage++;
796 Pass.ORE->emit([&]() {
798 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
799 << "No need to pipeline - no overlapped iterations in schedule.";
800 });
801 return;
802 }
803 // Check that the maximum stage count is less than user-defined limit.
804 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
805 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
806 << " : too many stages, abort\n");
807 NumFailLargeMaxStage++;
808 Pass.ORE->emit([&]() {
810 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
811 << "Too many stages in schedule: "
812 << ore::NV("numStages", (int)numStages) << " > "
813 << ore::NV("SwpMaxStages", SwpMaxStages)
814 << ". Refer to -pipeliner-max-stages.";
815 });
816 return;
817 }
818
819 Pass.ORE->emit([&]() {
821 Loop.getHeader())
822 << "Pipelined succesfully!";
823 });
824
825 // Generate the schedule as a ModuloSchedule.
826 DenseMap<MachineInstr *, int> Cycles, Stages;
827 std::vector<MachineInstr *> OrderedInsts;
828 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
829 ++Cycle) {
830 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
831 OrderedInsts.push_back(SU->getInstr());
832 Cycles[SU->getInstr()] = Cycle;
833 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
834 }
835 }
837 for (auto &KV : NewMIs) {
838 Cycles[KV.first] = Cycles[KV.second];
839 Stages[KV.first] = Stages[KV.second];
840 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
841 }
842
843 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
844 std::move(Stages));
846 assert(NewInstrChanges.empty() &&
847 "Cannot serialize a schedule with InstrChanges!");
849 MSTI.annotate();
850 return;
851 }
852 // The experimental code generator can't work if there are InstChanges.
853 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
854 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
855 MSE.expand();
856 } else if (MVECodeGen && NewInstrChanges.empty() &&
857 LoopPipelinerInfo->isMVEExpanderSupported() &&
859 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
860 MSE.expand();
861 } else {
862 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
863 MSE.expand();
864 MSE.cleanup();
865 }
866 ++NumPipelined;
867}
868
869/// Clean up after the software pipeliner runs.
871 for (auto &KV : NewMIs)
872 MF.deleteMachineInstr(KV.second);
873 NewMIs.clear();
874
875 // Call the superclass.
877}
878
879/// Return the register values for the operands of a Phi instruction.
880/// This function assume the instruction is a Phi.
882 Register &InitVal, Register &LoopVal) {
883 assert(Phi.isPHI() && "Expecting a Phi.");
884
885 InitVal = Register();
886 LoopVal = Register();
887 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
888 if (Phi.getOperand(i + 1).getMBB() != Loop)
889 InitVal = Phi.getOperand(i).getReg();
890 else
891 LoopVal = Phi.getOperand(i).getReg();
892
893 assert(InitVal && LoopVal && "Unexpected Phi structure.");
894}
895
896/// Return the Phi register value that comes the loop block.
898 const MachineBasicBlock *LoopBB) {
899 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
900 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
901 return Phi.getOperand(i).getReg();
902 return Register();
903}
904
905/// Return true if SUb can be reached from SUa following the chain edges.
906static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
909 Worklist.push_back(SUa);
910 while (!Worklist.empty()) {
911 const SUnit *SU = Worklist.pop_back_val();
912 for (const auto &SI : SU->Succs) {
913 SUnit *SuccSU = SI.getSUnit();
914 if (SI.getKind() == SDep::Order) {
915 if (Visited.count(SuccSU))
916 continue;
917 if (SuccSU == SUb)
918 return true;
919 Worklist.push_back(SuccSU);
920 Visited.insert(SuccSU);
921 }
922 }
923 }
924 return false;
925}
926
927SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
928 if (!getUnderlyingObjects())
929 return;
930 for (const Value *Obj : UnderlyingObjs)
931 if (!isIdentifiedObject(Obj)) {
932 IsAllIdentified = false;
933 break;
934 }
935}
936
938 const SUnitWithMemInfo &Other) const {
939 // If all underlying objects are identified objects and there is no overlap
940 // between them, then these two instructions are disjoint.
941 if (!IsAllIdentified || !Other.IsAllIdentified)
942 return false;
943 for (const Value *Obj : UnderlyingObjs)
944 if (llvm::is_contained(Other.UnderlyingObjs, Obj))
945 return false;
946 return true;
947}
948
949/// Collect the underlying objects for the memory references of an instruction.
950/// This function calls the code in ValueTracking, but first checks that the
951/// instruction has a memory operand.
952/// Returns false if we cannot find the underlying objects.
953bool SUnitWithMemInfo::getUnderlyingObjects() {
954 const MachineInstr *MI = SU->getInstr();
955 if (!MI->hasOneMemOperand())
956 return false;
957 MachineMemOperand *MM = *MI->memoperands_begin();
958 if (!MM->getValue())
959 return false;
960 MemOpValue = MM->getValue();
961 MemOpOffset = MM->getOffset();
963
964 // TODO: A no alias scope may be valid only in a single iteration. In this
965 // case we need to peel off it like LoopAccessAnalysis does.
966 AATags = MM->getAAInfo();
967 return true;
968}
969
970/// Returns true if there is a loop-carried order dependency from \p Src to \p
971/// Dst.
972static bool
973hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
975 const TargetRegisterInfo *TRI,
976 const SwingSchedulerDAG *SSD, bool PerformCheapCheck) {
977 if (Src.isTriviallyDisjoint(Dst))
978 return false;
979 if (isSuccOrder(Src.SU, Dst.SU))
980 return false;
981
982 MachineInstr &SrcMI = *Src.SU->getInstr();
983 MachineInstr &DstMI = *Dst.SU->getInstr();
984 if (PerformCheapCheck) {
985 // First, perform the cheaper check that compares the base register.
986 // If they are the same and the load offset is less than the store
987 // offset, then mark the dependence as loop carried potentially.
988 //
989 // TODO: This check will be removed.
990 const MachineOperand *BaseOp1, *BaseOp2;
991 int64_t Offset1, Offset2;
992 bool Offset1IsScalable, Offset2IsScalable;
993 if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable,
994 TRI) &&
995 TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable,
996 TRI)) {
997 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
998 Offset1IsScalable == Offset2IsScalable &&
999 (int)Offset1 < (int)Offset2) {
1001 "What happened to the chain edge?");
1002 return true;
1003 }
1004 }
1005 }
1006
1007 if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI))
1008 return false;
1009
1010 // Second, the more expensive check that uses alias analysis on the
1011 // base registers. If they alias, and the load offset is less than
1012 // the store offset, the mark the dependence as loop carried.
1013 if (Src.isUnknown() || Dst.isUnknown())
1014 return true;
1015 if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
1016 return true;
1017
1018 if (BAA.isNoAlias(
1019 MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags),
1020 MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags)))
1021 return false;
1022
1023 // AliasAnalysis sometimes gives up on following the underlying
1024 // object. In such a case, separate checks for underlying objects may
1025 // prove that there are no aliases between two accesses.
1026 for (const Value *SrcObj : Src.UnderlyingObjs)
1027 for (const Value *DstObj : Dst.UnderlyingObjs)
1028 if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags),
1029 MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags)))
1030 return true;
1031
1032 return false;
1033}
1034
1035void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
1036 const MachineInstr *MI = SU->getInstr();
1037 if (!MI->mayLoadOrStore())
1038 return;
1039 (MI->mayStore() ? Stores : Loads).emplace_back(SU);
1040}
1041
1044 const TargetRegisterInfo *TRI)
1045 : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
1046 LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
1047
  // Traverse all instructions and extract only what we are targeting.
1050 for (auto &SU : SUnits) {
1051 auto Tagged = getInstrTag(&SU);
1052
1053 // This instruction has no loop-carried order-dependencies.
1054 if (!Tagged)
1055 continue;
1056 TaggedSUnits.emplace_back(&SU, *Tagged);
1057 }
1058
1059 computeDependenciesAux();
1060}
1061
1062std::optional<LoopCarriedOrderDepsTracker::InstrTag>
1063LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
1064 MachineInstr *MI = SU->getInstr();
1065 if (TII->isGlobalMemoryObject(MI))
1066 return InstrTag::Barrier;
1067
1068 if (MI->mayStore() ||
1069 (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
1070 return InstrTag::LoadOrStore;
1071
1072 if (MI->mayRaiseFPException())
1073 return InstrTag::FPExceptions;
1074
1075 return std::nullopt;
1076}
1077
1078void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
1079 const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
1080 bool PerformCheapCheck) {
1081 // Avoid self-dependencies.
1082 if (Src.SU == Dst.SU)
1083 return;
1084
1085 if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG, PerformCheapCheck))
1086 LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
1087}
1088
1089void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
1090 const LoadStoreChunk &From, const LoadStoreChunk &To) {
1091 // Add load-to-store dependencies (WAR).
1092 for (const SUnitWithMemInfo &Src : From.Loads)
1093 for (const SUnitWithMemInfo &Dst : To.Stores)
1094 // Perform a cheap check first if this is a forward dependency.
1095 addDependenciesBetweenSUs(Src, Dst, Src.SU->NodeNum < Dst.SU->NodeNum);
1096
1097 // Add store-to-load dependencies (RAW).
1098 for (const SUnitWithMemInfo &Src : From.Stores)
1099 for (const SUnitWithMemInfo &Dst : To.Loads)
1100 addDependenciesBetweenSUs(Src, Dst);
1101
1102 // Add store-to-store dependencies (WAW).
1103 for (const SUnitWithMemInfo &Src : From.Stores)
1104 for (const SUnitWithMemInfo &Dst : To.Stores)
1105 addDependenciesBetweenSUs(Src, Dst);
1106}
1107
/// Split the tagged instructions into chunks delimited by barrier
/// instructions, then add loop-carried order dependences among the memory
/// operations inside each chunk.
void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
  for (const auto &TSU : TaggedSUnits) {
    InstrTag Tag = TSU.getTag();
    SUnit *SU = TSU.getPointer();
    switch (Tag) {
    case InstrTag::Barrier:
      // A barrier ends the current chunk; start a new one.
      Chunks.emplace_back();
      break;
    case InstrTag::LoadOrStore:
      Chunks.back().append(SU);
      break;
    case InstrTag::FPExceptions:
      // TODO: Handle this properly.
      break;
    }
  }

  // Add dependencies between memory operations. If there are one or more
  // barrier events between two memory instructions, we don't add a
  // loop-carried dependence for them.
  for (const LoadStoreChunk &Chunk : Chunks)
    addLoopCarriedDepenenciesForChunks(Chunk, Chunk);

  // TODO: If there are multiple barrier instructions, dependencies from the
  // last barrier instruction (or load/store below it) to the first barrier
  // instruction (or load/store above it).
}
1136
1137/// Add a chain edge between a load and store if the store can be an
1138/// alias of the load on a subsequent iteration, i.e., a loop carried
1139/// dependence. This code is very similar to the code in ScheduleDAGInstrs
1140/// but that code doesn't create loop carried dependences.
1141/// TODO: Also compute output-dependencies.
1142LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
1143 LoopCarriedEdges LCE;
1144
1145 // Add loop-carried order-dependencies
1146 LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
1147 LCODTracker.computeDependencies();
1148 for (unsigned I = 0; I != SUnits.size(); I++)
1149 for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
1150 LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
1151
1152 LCE.modifySUnits(SUnits, TII);
1153 return LCE;
1154}
1155
/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
/// processes dependences for PHIs. This function adds true dependences
/// from a PHI to a use, and a loop carried dependence from the use to the
/// PHI. The loop carried dependence is represented as an anti dependence
/// edge. This function also removes chain dependences between unrelated
/// PHIs.
void SwingSchedulerDAG::updatePhiDependences() {
  SmallVector<SDep, 4> RemoveDeps;
  const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();

  // Iterate over each DAG node.
  for (SUnit &I : SUnits) {
    RemoveDeps.clear();
    // Record the registers of any Phi use/def seen on this instruction
    // (invalid Register if none); consulted below when pruning order edges.
    Register HasPhiUse;
    Register HasPhiDef;
    MachineInstr *MI = I.getInstr();
    // Iterate over each operand, and we process the definitions.
    for (const MachineOperand &MO : MI->operands()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (MO.isDef()) {
        // If the register is used by a Phi, then create an anti dependence.
            UI = MRI.use_instr_begin(Reg),
            UE = MRI.use_instr_end();
             UI != UE; ++UI) {
          MachineInstr *UseMI = &*UI;
          SUnit *SU = getSUnit(UseMI);
          if (SU != nullptr && UseMI->isPHI()) {
            if (!MI->isPHI()) {
              // Loop-carried use: the Phi reads this def on the next
              // iteration, modeled as an anti dependence with latency 1.
              SDep Dep(SU, SDep::Anti, Reg);
              Dep.setLatency(1);
              I.addPred(Dep);
            } else {
              HasPhiDef = Reg;
              // Add a chain edge to a dependent Phi that isn't an existing
              // predecessor.
              if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
                I.addPred(SDep(SU, SDep::Barrier));
            }
          }
        }
      } else if (MO.isUse()) {
        // If the register is defined by a Phi, then create a true dependence.
        MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
        if (DefMI == nullptr)
          continue;
        SUnit *SU = getSUnit(DefMI);
        if (SU != nullptr && DefMI->isPHI()) {
          if (!MI->isPHI()) {
            SDep Dep(SU, SDep::Data, Reg);
            Dep.setLatency(0);
            // Give the target a chance to adjust the latency of this edge.
            ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
                                     &SchedModel);
            I.addPred(Dep);
          } else {
            HasPhiUse = Reg;
            // Add a chain edge to a dependent Phi that isn't an existing
            // predecessor.
            if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
              I.addPred(SDep(SU, SDep::Barrier));
          }
        }
      }
    }
    // Remove order dependences from an unrelated Phi.
    if (!SwpPruneDeps)
      continue;
    for (auto &PI : I.Preds) {
      MachineInstr *PMI = PI.getSUnit()->getInstr();
      if (PMI->isPHI() && PI.getKind() == SDep::Order) {
        if (I.getInstr()->isPHI()) {
          // Keep order edges tied to this node's own Phi use/def registers;
          // those Phis are not "unrelated".
          if (PMI->getOperand(0).getReg() == HasPhiUse)
            continue;
          if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
            continue;
        }
        RemoveDeps.push_back(PI);
      }
    }
    // Collect first and remove afterwards: erasing while iterating Preds
    // would invalidate the iteration above.
    for (const SDep &D : RemoveDeps)
      I.removePred(D);
  }
}
1242
/// Iterate over each DAG node and see if we can change any dependences
/// in order to reduce the recurrence MII.
void SwingSchedulerDAG::changeDependences() {
  // See if an instruction can use a value from the previous iteration.
  // If so, we update the base and offset of the instruction and change
  // the dependences.
  for (SUnit &I : SUnits) {
    unsigned BasePos = 0, OffsetPos = 0;
    Register NewBase;
    int64_t NewOffset = 0;
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
                               NewOffset))
      continue;

    // Get the MI and SUnit for the instruction that defines the original base.
    Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
    if (!DefMI)
      continue;
    SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;
    // Get the MI and SUnit for the instruction that defines the new base.
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    if (!LastMI)
      continue;
    SUnit *LastSU = getSUnit(LastMI);
    if (!LastSU)
      continue;

    // Rewriting would create a cycle if the new-base def is already
    // reachable from this node; bail out in that case.
    if (Topo.IsReachable(&I, LastSU))
      continue;

    // Remove the dependence. The value now depends on a prior iteration.
    for (const SDep &P : I.Preds)
      if (P.getSUnit() == DefSU)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(&I, D.getSUnit());
      I.removePred(D);
    }
    // Remove the chain dependence between the instructions.
    Deps.clear();
    for (auto &P : LastSU->Preds)
      if (P.getSUnit() == &I && P.getKind() == SDep::Order)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(LastSU, D.getSUnit());
      LastSU->removePred(D);
    }

    // Add a dependence between the new instruction and the instruction
    // that defines the new base.
    SDep Dep(&I, SDep::Anti, NewBase);
    Topo.AddPred(LastSU, &I);
    LastSU->addPred(Dep);

    // Remember the base and offset information so that we can update the
    // instruction during code generation.
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
  }
}
1306
/// Create an instruction stream that represents a single iteration and stage of
/// each instruction. This function differs from SMSchedule::finalizeSchedule in
/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
/// function is an approximation of SMSchedule::finalizeSchedule with all
/// non-const operations removed.
                                  SMSchedule &Schedule,
                                  std::vector<MachineInstr *> &OrderedInsts,

  // Move all instructions to the first stage from the later stages.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
         Stage <= LastStage; ++Stage) {
      for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
               Cycle + Stage * Schedule.getInitiationInterval()))) {
        // push_front combined with the reversed per-cycle list places
        // later-stage instructions ahead of earlier-stage ones for this
        // cycle while preserving intra-stage order.
        Instrs[Cycle].push_front(SU);
      }
    }
  }

  // Walk the kernel cycle by cycle: reorder the instructions within each
  // cycle as the final schedule would, then record the flattened order and
  // each instruction's stage.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
    CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
    for (SUnit *SU : CycleInstrs) {
      MachineInstr *MI = SU->getInstr();
      OrderedInsts.push_back(MI);
      Stages[MI] = Schedule.stageScheduled(SU);
    }
  }
}
1341
1342namespace {
1343
// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    // Itinerary-based model: scan the stages of this sched class and track
    // the stage with the fewest alternative units.
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    // MCSchedModel-based model: use the processor resources written by this
    // sched class instead of itinerary stages.
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // Only single-choice stages are "critical": the instruction has no
        // alternative but to occupy that unit.
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    // Fewer alternatives means higher priority; ties are broken by how
    // heavily the chosen unit is demanded.
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};
1447
/// Calculate the maximum register pressure of the scheduled instructions stream
class HighRegisterPressureDetector {
  MachineBasicBlock *OrigMBB;
  const MachineRegisterInfo &MRI;
  const TargetRegisterInfo *TRI;

  // Number of register pressure sets provided by the target.
  const unsigned PSetNum;

  // Indexed by PSet ID
  // InitSetPressure takes into account the register pressure of live-in
  // registers. It does not depend on how the loop is scheduled, so it's
  // enough to calculate it once at the beginning.
  std::vector<unsigned> InitSetPressure;

  // Indexed by PSet ID
  // Upper limit for each register pressure set
  std::vector<unsigned> PressureSetLimit;


public:
  using OrderedInstsTy = std::vector<MachineInstr *>;
  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;

private:
  // Print Pressures to dbgs() in the form "[p0 p1 ...]".
  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
    if (Pressures.size() == 0) {
      dbgs() << "[]";
    } else {
      char Prefix = '[';
      for (unsigned P : Pressures) {
        dbgs() << Prefix << P;
        Prefix = ' ';
      }
      dbgs() << ']';
    }
  }

  // Print the pressure-set IDs that Reg belongs to, for debugging.
  void dumpPSet(Register Reg) const {
    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
         ++PSetIter) {
      dbgs() << *PSetIter << ' ';
    }
    dbgs() << '\n';
  }

  // Add Reg's weight to every pressure set it belongs to.
  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter)
      Pressure[*PSetIter] += Weight;
  }

  // Subtract Reg's weight from every pressure set it belongs to.
  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter) {
      auto &P = Pressure[*PSetIter];
      assert(P >= Weight &&
             "register pressure must be greater than or equal weight");
      P -= Weight;
    }
  }

  // Return true if Reg is reserved one, for example, stack pointer
  bool isReservedRegister(Register Reg) const {
    return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
  }

  // Return true if Reg is a virtual register defined inside the loop block.
  bool isDefinedInThisLoop(Register Reg) const {
    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
  }

  // Search for live-in variables. They are factored into the register pressure
  // from the beginning. Live-in variables used by every iteration should be
  // considered as alive throughout the loop. For example, the variable `c` in
  // following code. \code
  //   int c = ...;
  //   for (int i = 0; i < n; i++)
  //     a[i] += b[i] + c;
  // \endcode
  void computeLiveIn() {
    for (auto &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      for (auto &Use : ROMap[&MI].Uses) {
        auto Reg = Use.RegUnit;
        // Ignore the variable that appears only on one side of phi instruction
        // because it's used only at the first iteration.
        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
          continue;
        if (isReservedRegister(Reg))
          continue;
        if (isDefinedInThisLoop(Reg))
          continue;
        Used.insert(Reg);
      }
    }

    for (auto LiveIn : Used)
      increaseRegisterPressure(InitSetPressure, LiveIn);
  }

  // Calculate the upper limit of each pressure set
  void computePressureSetLimit(const RegisterClassInfo &RCI) {
    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
      PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
  }

  // There are two patterns of last-use.
  // - by an instruction of the current iteration
  // - by a phi instruction of the next iteration (loop carried value)
  //
  // Furthermore, following two groups of instructions are executed
  // simultaneously
  // - next iteration's phi instructions in i-th stage
  // - current iteration's instructions in i+1-th stage
  //
  // This function calculates the last-use of each register while taking into
  // account the above two patterns.
  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
                                   Instr2StageTy &Stages) const {
    // We treat virtual registers that are defined and used in this loop.
    // Following virtual register will be ignored
    // - live-in one
    // - defined but not used in the loop (potentially live-out)
    DenseSet<Register> TargetRegs;
    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
      if (isDefinedInThisLoop(Reg))
        TargetRegs.insert(Reg);
    };
    for (MachineInstr *MI : OrderedInsts) {
      if (MI->isPHI()) {
        Register Reg = getLoopPhiReg(*MI, OrigMBB);
        UpdateTargetRegs(Reg);
      } else {
        for (auto &Use : ROMap.find(MI)->getSecond().Uses)
          UpdateTargetRegs(Use.RegUnit);
      }
    }

    // A higher score means the use happens logically later; a phi's use of a
    // loop-carried value belongs to the next iteration, hence the +1.
    const auto InstrScore = [&Stages](MachineInstr *MI) {
      return Stages[MI] + MI->isPHI();
    };

    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
      for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
        auto Reg = Use.RegUnit;
        if (!TargetRegs.contains(Reg))
          continue;
        auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
        if (!Inserted) {
          MachineInstr *Orig = Ite->second;
          MachineInstr *New = MI;
          if (InstrScore(Orig) < InstrScore(New))
            Ite->second = New;
        }
      }
    }

    Instr2LastUsesTy LastUses;
    for (auto &Entry : LastUseMI)
      LastUses[Entry.second].insert(Entry.first);
    return LastUses;
  }

  // Compute the maximum register pressure of the kernel. We'll simulate #Stage
  // iterations and check the register pressure at the point where all stages
  // overlap.
  //
  // An example of unrolled loop where #Stage is 4..
  // Iter   i+0 i+1 i+2 i+3
  // ------------------------
  // Stage   0
  // Stage   1   0
  // Stage   2   1   0
  // Stage   3   2   1   0  <- All stages overlap
  //
  std::vector<unsigned>
  computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
                        Instr2StageTy &Stages,
                        const unsigned StageCount) const {
    using RegSetTy = SmallDenseSet<Register, 16>;

    // Indexed by #Iter. To treat "local" variables of each stage separately, we
    // manage the liveness of the registers independently by iterations.
    SmallVector<RegSetTy> LiveRegSets(StageCount);

    auto CurSetPressure = InitSetPressure;
    auto MaxSetPressure = InitSetPressure;
    auto LastUses = computeLastUses(OrderedInsts, Stages);

    LLVM_DEBUG({
      dbgs() << "Ordered instructions:\n";
      for (MachineInstr *MI : OrderedInsts) {
        dbgs() << "Stage " << Stages[MI] << ": ";
        MI->dump();
      }
    });

    // Mark Reg live in RegSet and bump the current pressure, unless it is
    // already live or is a reserved register.
    const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                   Register Reg) {
      if (!Reg.isValid() || isReservedRegister(Reg))
        return;

      bool Inserted = RegSet.insert(Reg).second;
      if (!Inserted)
        return;

      LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
      increaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    // Kill Reg in RegSet and drop the current pressure; registers not in the
    // set (live-ins) are left untouched.
    const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                  Register Reg) {
      if (!Reg.isValid() || isReservedRegister(Reg))
        return;

      // live-in register
      if (!RegSet.contains(Reg))
        return;

      LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
      RegSet.erase(Reg);
      decreaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    for (unsigned I = 0; I < StageCount; I++) {
      for (MachineInstr *MI : OrderedInsts) {
        const auto Stage = Stages[MI];
        if (I < Stage)
          continue;

        const unsigned Iter = I - Stage;

        for (auto &Def : ROMap.find(MI)->getSecond().Defs)
          InsertReg(LiveRegSets[Iter], Def.RegUnit);

        for (auto LastUse : LastUses[MI]) {
          if (MI->isPHI()) {
            // A phi's loop-carried operand was produced by the previous
            // iteration, so the kill belongs to Iter - 1.
            if (Iter != 0)
              EraseReg(LiveRegSets[Iter - 1], LastUse);
          } else {
            EraseReg(LiveRegSets[Iter], LastUse);
          }
        }

        for (unsigned PSet = 0; PSet < PSetNum; PSet++)
          MaxSetPressure[PSet] =
              std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);

        LLVM_DEBUG({
          dbgs() << "CurSetPressure=";
          dumpRegisterPressures(CurSetPressure);
          dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
          MI->dump();
        });
      }
    }

    return MaxSetPressure;
  }

public:
  HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
                               const MachineFunction &MF)
      : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
        TRI(MF.getSubtarget().getRegisterInfo()),
        PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
        PressureSetLimit(PSetNum, 0) {}

  // Used to calculate register pressure, which is independent of loop
  // scheduling.
  void init(const RegisterClassInfo &RCI) {
    for (MachineInstr &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      ROMap[&MI].collect(MI, *TRI, MRI, false, true);
    }

    computeLiveIn();
    computePressureSetLimit(RCI);
  }

  // Calculate the maximum register pressures of the loop and check if they
  // exceed the limit
  bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
              const unsigned MaxStage) const {
           "the percentage of the margin must be between 0 to 100");

    OrderedInstsTy OrderedInsts;
    Instr2StageTy Stages;
    computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
    const auto MaxSetPressure =
        computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);

    LLVM_DEBUG({
      dbgs() << "Dump MaxSetPressure:\n";
      for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
        dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
      }
      dbgs() << '\n';
    });

    for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
      unsigned Limit = PressureSetLimit[PSet];
      unsigned Margin = Limit * RegPressureMargin / 100;
      LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
                        << " Margin=" << Margin << "\n");
      // Reject when the max pressure plus the safety margin exceeds the
      // limit for any pressure set.
      if (Limit < MaxSetPressure[PSet] + Margin) {
        LLVM_DEBUG(
            dbgs()
            << "Rejected the schedule because of too high register pressure\n");
        return true;
      }
    }
    return false;
  }
};
1777
1778} // end anonymous namespace
1779
1780/// Calculate the resource constrained minimum initiation interval for the
1781/// specified loop. We use the DFA to model the resources needed for
1782/// each instruction, and we ignore dependences. A different DFA is created
1783/// for each cycle that is required. When adding a new instruction, we attempt
1784/// to add it to each existing DFA, until a legal space is found. If the
1785/// instruction cannot be reserved in an existing DFA, we create a new one.
1786unsigned SwingSchedulerDAG::calculateResMII() {
1787 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1788 ResourceManager RM(&MF.getSubtarget(), this);
1789 return RM.calculateResMII();
1790}
1791
1792/// Calculate the recurrence-constrainted minimum initiation interval.
1793/// Iterate over each circuit. Compute the delay(c) and distance(c)
1794/// for each circuit. The II needs to satisfy the inequality
1795/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1796/// II that satisfies the inequality, and the RecMII is the maximum
1797/// of those values.
1798unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1799 unsigned RecMII = 0;
1800
1801 for (NodeSet &Nodes : NodeSets) {
1802 if (Nodes.empty())
1803 continue;
1804
1805 unsigned Delay = Nodes.getLatency();
1806 unsigned Distance = 1;
1807
1808 // ii = ceil(delay / distance)
1809 unsigned CurMII = (Delay + Distance - 1) / Distance;
1810 Nodes.setRecMII(CurMII);
1811 if (CurMII > RecMII)
1812 RecMII = CurMII;
1813 }
1814
1815 return RecMII;
1816}
1817
/// Create the adjacency structure of the nodes in the graph.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDAG *DAG) {
  BitVector Added(SUnits.size());
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (OE.isOutputDep()) {
        int N = OE.getDst()->NodeNum;
        int BackEdge = i;
        // If this node is already the tail of a recorded chain, extend that
        // chain instead of starting a new one.
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
        continue;

      // This code is retained to preserve previous behavior and prevent
      // regression. This condition means that anti-dependencies within an
      // iteration are ignored when searching circuits. Therefore it's natural
      // to consider this dependence as well.
      // FIXME: Remove this code if it doesn't have significant impact on
      // performance.
      if (OE.isAntiDep())
        continue;

      int N = OE.getDst()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }
    // A chain edge between a store and a load is treated as a back-edge in the
    // adjacency matrix.
    for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
      SUnit *Src = IE.getSrc();
      SUnit *Dst = IE.getDst();
      if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
        continue;
      if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
        int N = Src->NodeNum;
        if (!Added.test(N)) {
          AdjK[i].push_back(N);
          Added.set(N);
        }
      }
    }
  }
  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
    }
}
1881
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node (part of Johnson's circuit-enumeration algorithm).
/// Returns true if a circuit through V back to S was found.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          const SwingSchedulerDAG *DAG,
                                          bool HasBackedge) {
  SUnit *SV = &SUnits[V];
  bool F = false;
  Stack.insert(SV);
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Bound the amount of work: stop exploring once too many paths were
    // enumerated.
    if (NumPaths > MaxPaths)
      break;
    // Only nodes at or after the start node S are considered.
    if (W < S)
      continue;
    if (W == S) {
      // Closed a circuit back to the start node; record the nodes on the
      // stack unless the path was reached through a back-edge.
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
      F = true;
      ++NumPaths;
      break;
    }
    if (!Blocked.test(W)) {
      if (circuit(W, S, NodeSets, DAG,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit found through V: remember that V must be unblocked when any
    // of its successors becomes unblocked.
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
1923
/// Unblock a node in the circuit finding algorithm.
void SwingSchedulerDAG::Circuits::unblock(int U) {
  Blocked.reset(U);
  while (!BU.empty()) {
    assert(SI != BU.end() && "Invalid B set.");
    SUnit *W = *SI;
    BU.erase(W);
    // Recursively unblock the nodes that were waiting on U.
    if (Blocked.test(W->NodeNum))
      unblock(W->NodeNum);
  }
}
1937
1938/// Identify all the elementary circuits in the dependence graph using
1939/// Johnson's circuit algorithm.
1940void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1941 Circuits Cir(SUnits, Topo);
1942 // Create the adjacency structure.
1943 Cir.createAdjacencyStructure(this);
1944 for (int I = 0, E = SUnits.size(); I != E; ++I) {
1945 Cir.reset();
1946 Cir.circuit(I, I, NodeSets, this);
1947 }
1948}
1949
// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
// is loop-carried to the USE in next iteration. This will help pipeliner avoid
// additional copies that are needed across iterations. An artificial dependence
// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.

// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
// PHI-------True-Dep------> USEOfPhi

// The mutation creates
// USEOfPHI -------Artificial-Dep---> SRCOfCopy

// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
// late to avoid additional copies across iterations. The possible scheduling
// order would be
// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.

void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    // Find the COPY/REG_SEQUENCE instruction.
    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
      continue;

    // Record the loop carried PHIs.
    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.

    for (auto &Dep : SU.Preds) {
      SUnit *TmpSU = Dep.getSUnit();
      MachineInstr *TmpMI = TmpSU->getInstr();
      SDep::Kind DepKind = Dep.getKind();
      // Save the loop carried PHI.
      if (DepKind == SDep::Anti && TmpMI->isPHI())
        PHISUs.push_back(TmpSU);
      // Save the source of COPY/REG_SEQUENCE.
      // If the source has no predecessors, we will end up creating cycles.
      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
        SrcSUs.push_back(TmpSU);
    }

    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
      continue;

    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
    // SUnit to the container.
    // Do not use iterator based loop here as we are updating the container.
    for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
      for (auto &Dep : PHISUs[Index]->Succs) {
        if (Dep.getKind() != SDep::Data)
          continue;

        SUnit *TmpSU = Dep.getSUnit();
        MachineInstr *TmpMI = TmpSU->getInstr();
        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
          // Chained PHI/REG_SEQUENCE users are traversed transitively.
          PHISUs.push_back(TmpSU);
          continue;
        }
        UseSUs.push_back(TmpSU);
      }
    }

    if (UseSUs.size() == 0)
      continue;

    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
    // Add the artificial dependencies if it does not form a cycle.
    for (auto *I : UseSUs) {
      for (auto *Src : SrcSUs) {
        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
          Src->addPred(SDep(I, SDep::Artificial));
          SDAG->Topo.AddPred(Src, I);
        }
      }
    }
  }
}
2029
2030/// Compute several functions need to order the nodes for scheduling.
2031/// ASAP - Earliest time to schedule a node.
2032/// ALAP - Latest time to schedule a node.
2033/// MOV - Mobility function, difference between ALAP and ASAP.
2034/// D - Depth of each node.
2035/// H - Height of each node.
2036void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
2037 ScheduleInfo.resize(SUnits.size());
2038
2039 LLVM_DEBUG({
2040 for (int I : Topo) {
2041 const SUnit &SU = SUnits[I];
2042 dumpNode(SU);
2043 }
2044 });
2045
2046 int maxASAP = 0;
2047 // Compute ASAP and ZeroLatencyDepth.
2048 for (int I : Topo) {
2049 int asap = 0;
2050 int zeroLatencyDepth = 0;
2051 SUnit *SU = &SUnits[I];
2052 for (const auto &IE : DDG->getInEdges(SU)) {
2053 SUnit *Pred = IE.getSrc();
2054 if (IE.getLatency() == 0)
2055 zeroLatencyDepth =
2056 std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
2057 if (IE.ignoreDependence(true))
2058 continue;
2059 asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
2060 IE.getDistance() * MII));
2061 }
2062 maxASAP = std::max(maxASAP, asap);
2063 ScheduleInfo[I].ASAP = asap;
2064 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
2065 }
2066
2067 // Compute ALAP, ZeroLatencyHeight, and MOV.
2068 for (int I : llvm::reverse(Topo)) {
2069 int alap = maxASAP;
2070 int zeroLatencyHeight = 0;
2071 SUnit *SU = &SUnits[I];
2072 for (const auto &OE : DDG->getOutEdges(SU)) {
2073 SUnit *Succ = OE.getDst();
2074 if (Succ->isBoundaryNode())
2075 continue;
2076 if (OE.getLatency() == 0)
2077 zeroLatencyHeight =
2078 std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
2079 if (OE.ignoreDependence(true))
2080 continue;
2081 alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
2082 OE.getDistance() * MII));
2083 }
2084
2085 ScheduleInfo[I].ALAP = alap;
2086 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
2087 }
2088
2089 // After computing the node functions, compute the summary for each node set.
2090 for (NodeSet &I : NodeSets)
2091 I.computeNodeSetInfo(this);
2092
2093 LLVM_DEBUG({
2094 for (unsigned i = 0; i < SUnits.size(); i++) {
2095 dbgs() << "\tNode " << i << ":\n";
2096 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
2097 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
2098 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
2099 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
2100 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
2101 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
2102 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
2103 }
2104 });
2105}
2106
/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
/// as the predecessors of the elements of NodeOrder that are not also in
/// NodeOrder. If \p S is provided, only nodes contained in S are considered.
/// Returns true if any such predecessor was found (i.e. Preds is non-empty).
                   const NodeSet *S = nullptr) {
  Preds.clear();

  for (SUnit *SU : NodeOrder) {
    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *PredSU = IE.getSrc();
      if (S && S->count(PredSU) == 0)
        continue;
      if (IE.ignoreDependence(true))
        continue;
      if (NodeOrder.count(PredSU) == 0)
        Preds.insert(PredSU);
    }

    // FIXME: The following loop-carried dependencies may also need to be
    // considered.
    // - Physical register dependencies (true-dependence and WAW).
    // - Memory dependencies.
    // Successors reached via anti-dependences are treated as predecessors.
    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *SuccSU = OE.getDst();
      if (!OE.isAntiDep())
        continue;
      if (S && S->count(SuccSU) == 0)
        continue;
      if (NodeOrder.count(SuccSU) == 0)
        Preds.insert(SuccSU);
    }
  }
  return !Preds.empty();
}
2142
/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
/// as the successors of the elements of NodeOrder that are not also in
/// NodeOrder. If \p S is provided, only nodes contained in S are considered.
/// Returns true if any such successor was found (i.e. Succs is non-empty).
                   const NodeSet *S = nullptr) {
  Succs.clear();

  for (SUnit *SU : NodeOrder) {
    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *SuccSU = OE.getDst();
      if (S && S->count(SuccSU) == 0)
        continue;
      if (OE.ignoreDependence(false))
        continue;
      if (NodeOrder.count(SuccSU) == 0)
        Succs.insert(SuccSU);
    }

    // FIXME: The following loop-carried dependencies may also need to be
    // considered.
    // - Physical register dependencies (true-dependence and WAW).
    // - Memory dependencies.
    // Predecessors reached via anti-dependences are treated as successors.
    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *PredSU = IE.getSrc();
      if (!IE.isAntiDep())
        continue;
      if (S && S->count(PredSU) == 0)
        continue;
      if (NodeOrder.count(PredSU) == 0)
        Succs.insert(PredSU);
    }
  }
  return !Succs.empty();
}
2178
2179/// Return true if there is a path from the specified node to any of the nodes
2180/// in DestNodes. Keep track and return the nodes in any path.
2181static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
2182 SetVector<SUnit *> &DestNodes,
2183 SetVector<SUnit *> &Exclude,
2184 SmallPtrSet<SUnit *, 8> &Visited,
2185 SwingSchedulerDDG *DDG) {
2186 if (Cur->isBoundaryNode())
2187 return false;
2188 if (Exclude.contains(Cur))
2189 return false;
2190 if (DestNodes.contains(Cur))
2191 return true;
2192 if (!Visited.insert(Cur).second)
2193 return Path.contains(Cur);
2194 bool FoundPath = false;
2195 for (const auto &OE : DDG->getOutEdges(Cur))
2196 if (!OE.ignoreDependence(false))
2197 FoundPath |=
2198 computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
2199 for (const auto &IE : DDG->getInEdges(Cur))
2200 if (IE.isAntiDep() && IE.getDistance() == 0)
2201 FoundPath |=
2202 computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
2203 if (FoundPath)
2204 Path.insert(Cur);
2205 return FoundPath;
2206}
2207
/// Compute the live-out registers for the instructions in a node-set.
/// The live-out registers are those that are defined in the node-set,
/// but not used. Except for use operands of Phis. The result is added to
/// \p RPTracker as the set of registers live at the bottom of the node-set.
                            NodeSet &NS) {
  // Collect every register used by a non-Phi instruction in the node-set:
  // virtual registers directly, physical registers by their register units.
  for (SUnit *SU : NS) {
    const MachineInstr *MI = SU->getInstr();
    if (MI->isPHI())
      continue;
    for (const MachineOperand &MO : MI->all_uses()) {
      Register Reg = MO.getReg();
      if (Reg.isVirtual())
        Uses.insert(Reg);
      else if (MRI.isAllocatable(Reg))
        Uses.insert_range(TRI->regunits(Reg.asMCReg()));
    }
  }
  // A non-dead def with no use inside the node-set is live-out.
  for (SUnit *SU : NS)
    for (const MachineOperand &MO : SU->getInstr()->all_defs())
      if (!MO.isDead()) {
        Register Reg = MO.getReg();
        if (Reg.isVirtual()) {
          if (!Uses.count(Reg))
            LiveOutRegs.emplace_back(Reg, LaneBitmask::getNone());
        } else if (MRI.isAllocatable(Reg)) {
          for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
            if (!Uses.count(Unit))
              LiveOutRegs.emplace_back(Unit, LaneBitmask::getNone());
        }
      }
  RPTracker.addLiveRegs(LiveOutRegs);
}
2244
2245/// A heuristic to filter nodes in recurrent node-sets if the register
2246/// pressure of a set is too high.
void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
  for (auto &NS : NodeSets) {
    // Skip small node-sets since they won't cause register pressure problems.
    if (NS.size() <= 2)
      continue;
    IntervalPressure RecRegPressure;
    RegPressureTracker RecRPTracker(RecRegPressure);
    RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
    computeLiveOuts(MF, RecRPTracker, NS);
    RecRPTracker.closeBottom();

    // Visit the instructions bottom-up: higher NodeNum first.
    std::vector<SUnit *> SUnits(NS.begin(), NS.end());
    llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
      return A->NodeNum > B->NodeNum;
    });

    for (auto &SU : SUnits) {
      // Since we're computing the register pressure for a subset of the
      // instructions in a block, we need to set the tracker for each
      // instruction in the node-set. The tracker is set to the instruction
      // just after the one we're interested in.
      RecRPTracker.setPos(std::next(CurInstI));

      RegPressureDelta RPDelta;
      ArrayRef<PressureChange> CriticalPSets;
      RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
                                             CriticalPSets,
                                             RecRegPressure.MaxSetPressure);
      // Mark the node-set at the first instruction that pushes any pressure
      // set over its limit, then stop scanning this set.
      if (RPDelta.Excess.isValid()) {
        LLVM_DEBUG(
            dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
                   << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
                   << ":" << RPDelta.Excess.getUnitInc() << "\n");
        NS.setExceedPressure(SU);
        break;
      }
      RecRPTracker.recede();
    }
  }
}
2288
2289/// A heuristic to colocate node sets that have the same set of
2290/// successors.
void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
  unsigned Colocate = 0;
  for (int i = 0, e = NodeSets.size(); i < e; ++i) {
    NodeSet &N1 = NodeSets[i];
    if (N1.empty() || !succ_L(N1, S1, DDG.get()))
      continue;
    for (int j = i + 1; j < e; ++j) {
      NodeSet &N2 = NodeSets[j];
      // Only node-sets with the same recurrence MII are candidates.
      if (N1.compareRecMII(N2) != 0)
        continue;
      if (N2.empty() || !succ_L(N2, S2, DDG.get()))
        continue;
      // Subset check plus equal size means the successor sets are identical,
      // so tag both sets with the same colocation id.
      if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
        N1.setColocate(++Colocate);
        N2.setColocate(Colocate);
        break;
      }
    }
  }
}
2313
2314/// Check if the existing node-sets are profitable. If not, then ignore the
2315/// recurrent node-sets, and attempt to schedule all nodes together. This is
2316/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2317/// then it's best to try to schedule all instructions together instead of
2318/// starting with the recurrent node-sets.
2319void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2320 // Look for loops with a large MII.
2321 if (MII < 17)
2322 return;
2323 // Check if the node-set contains only a simple add recurrence.
2324 for (auto &NS : NodeSets) {
2325 if (NS.getRecMII() > 2)
2326 return;
2327 if (NS.getMaxDepth() > MII)
2328 return;
2329 }
2330 NodeSets.clear();
2331 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2332}
2333
2334/// Add the nodes that do not belong to a recurrence set into groups
2335/// based upon connected components.
void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
  SetVector<SUnit *> NodesAdded;
  // Add the nodes that are on a path between the previous node sets and
  // the current node set.
  for (NodeSet &I : NodeSets) {
    // Add the nodes from the current node set to the previous node set.
    if (succ_L(I, N, DDG.get())) {
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Add the nodes from the previous node set to the current node set.
    N.clear();
    if (succ_L(NodesAdded, N, DDG.get())) {
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    NodesAdded.insert_range(I);
  }

  // Create a new node set with the connected nodes of any successor of a node
  // in a recurrent set.
  NodeSet NewSet;
  if (succ_L(NodesAdded, N, DDG.get()))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create a new node set with the connected nodes of any predecessor of a
  // node in a recurrent set.
  NewSet.clear();
  if (pred_L(NodesAdded, N, DDG.get()))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create new node sets with the connected nodes of any remaining node that
  // has no predecessor.
  for (SUnit &SU : SUnits) {
    if (NodesAdded.count(&SU) == 0) {
      NewSet.clear();
      addConnectedNodes(&SU, NewSet, NodesAdded);
      if (!NewSet.empty())
        NodeSets.push_back(NewSet);
    }
  }
}
2397
2398/// Add the node to the set, and add all of its connected nodes to the set.
2399void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2400 SetVector<SUnit *> &NodesAdded) {
2401 NewSet.insert(SU);
2402 NodesAdded.insert(SU);
2403 for (auto &OE : DDG->getOutEdges(SU)) {
2404 SUnit *Successor = OE.getDst();
2405 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2406 NodesAdded.count(Successor) == 0)
2407 addConnectedNodes(Successor, NewSet, NodesAdded);
2408 }
2409 for (auto &IE : DDG->getInEdges(SU)) {
2410 SUnit *Predecessor = IE.getSrc();
2411 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2412 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2413 }
2414}
2415
2416/// Return true if Set1 contains elements in Set2. The elements in common
2417/// are returned in a different container.
static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
  // Gather the elements of Set1 that also appear in Set2; return whether
  // the intersection is non-empty.
  Result.clear();
  for (SUnit *SU : Set1) {
    if (Set2.count(SU) != 0)
      Result.insert(SU);
  }
  return !Result.empty();
}
2427
2428/// Merge the recurrence node sets that have the same initial node.
void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I) {
    NodeSet &NI = *I;
    for (NodeSetType::iterator J = I + 1; J != E;) {
      NodeSet &NJ = *J;
      // Two recurrences with the same initial node are merged into I.
      if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
        // Keep the larger recurrence MII of the two sets.
        if (NJ.compareRecMII(NI) > 0)
          NI.setRecMII(NJ.getRecMII());
        for (SUnit *SU : *J)
          I->insert(SU);
        // erase() shifts elements left, so J already refers to the next set
        // and must not be incremented; the end iterator must be recomputed.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
  }
}
2448
2449/// Remove nodes that have been scheduled in previous NodeSets.
void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I)
    for (NodeSetType::iterator J = I + 1; J != E;) {
      // Drop from J every node already present in the earlier set I.
      J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });

      // erase() shifts elements left, so J already refers to the next set
      // and must not be incremented; the end iterator must be recomputed.
      if (J->empty()) {
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
}
2464
2465/// Compute an ordered list of the dependence graph nodes, which
2466/// indicates the order that the nodes will be scheduled. This is a
2467/// two-level algorithm. First, a partial order is created, which
2468/// consists of a list of sets ordered from highest to lowest priority.
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
  NodeOrder.clear();

  for (auto &Nodes : NodeSets) {
    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    OrderKind Order;
    // Pick the initial direction: bottom-up if all outside predecessors of
    // the ordered nodes are in this set, top-down if all successors are.
    if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
    } else if (succ_L(NodeOrder, N, DDG.get()) &&
               llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (succs) ");
    } else if (isIntersect(N, Nodes, R)) {
      // If some of the successors are in the existing node-set, then use the
      // top-down ordering.
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (intersect) ");
    } else if (NodeSets.size() == 1) {
      // Single node-set: seed with the leaf nodes (no successors).
      for (const auto &N : Nodes)
        if (N->Succs.size() == 0)
          R.insert(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (all) ");
    } else {
      // Find the node with the highest ASAP.
      SUnit *maxASAP = nullptr;
      for (SUnit *SU : Nodes) {
        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
          maxASAP = SU;
      }
      R.insert(maxASAP);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (default) ");
    }

    while (!R.empty()) {
      if (Order == TopDown) {
        // Choose the node with the maximum height. If more than one, choose
        // the node with the maximum ZeroLatencyHeight. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxHeight = nullptr;
          for (SUnit *I : R) {
            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) ==
                         getZeroLatencyHeight(maxHeight) &&
                     getMOV(I) < getMOV(maxHeight))
              maxHeight = I;
          }
          NodeOrder.insert(maxHeight);
          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
          R.remove(maxHeight);
          // Make this node's unordered successors within the set ready.
          for (const auto &OE : DDG->getOutEdges(maxHeight)) {
            SUnit *SU = OE.getDst();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            if (OE.ignoreDependence(false))
              continue;
            R.insert(SU);
          }

          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &IE : DDG->getInEdges(maxHeight)) {
            SUnit *SU = IE.getSrc();
            if (!IE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        Order = BottomUp;
        LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
        if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      } else {
        // Choose the node with the maximum depth. If more than one, choose
        // the node with the maximum ZeroLatencyDepth. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxDepth = nullptr;
          for (SUnit *I : R) {
            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
                     getMOV(I) < getMOV(maxDepth))
              maxDepth = I;
          }
          NodeOrder.insert(maxDepth);
          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
          R.remove(maxDepth);
          // Restart top-down from the first node if this one exceeded the
          // register-pressure limit recorded for the set.
          if (Nodes.isExceedSU(maxDepth)) {
            Order = TopDown;
            R.clear();
            R.insert(Nodes.getNode(0));
            break;
          }
          // Make this node's unordered predecessors within the set ready.
          for (const auto &IE : DDG->getInEdges(maxDepth)) {
            SUnit *SU = IE.getSrc();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }

          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &OE : DDG->getOutEdges(maxDepth)) {
            SUnit *SU = OE.getDst();
            if (!OE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        Order = TopDown;
        LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
        if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      }
    }
    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
  }

  LLVM_DEBUG({
    dbgs() << "Node order: ";
    for (SUnit *I : NodeOrder)
      dbgs() << " " << I->NodeNum << " ";
    dbgs() << "\n";
  });
}
2630
2631/// Process the nodes in the computed order and create the pipelined schedule
2632/// of the instructions, if possible. Return true if a schedule is found.
bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {

  if (NodeOrder.empty()){
    LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
    return false;
  }

  bool scheduleFound = false;
  std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
  if (LimitRegPressure) {
    HRPDetector =
        std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
    HRPDetector->init(RegClassInfo);
  }
  // Keep increasing II until a valid schedule is found.
  for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
    Schedule.reset();
    Schedule.setInitiationInterval(II);
    LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");

    do {
      SUnit *SU = *NI;

      // Compute the schedule time for the instruction, which is based
      // upon the scheduled time for any predecessors/successors.
      int EarlyStart = INT_MIN;
      int LateStart = INT_MAX;
      Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
      LLVM_DEBUG({
        dbgs() << "\n";
        dbgs() << "Inst (" << SU->NodeNum << ") ";
        SU->getInstr()->dump();
        dbgs() << "\n";
      });
      LLVM_DEBUG(
          dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));

      // An empty window means the node cannot be placed at this II. When only
      // one bound is constrained, search forward (from EarlyStart) or backward
      // (from LateStart) over a full II worth of cycles.
      if (EarlyStart > LateStart)
        scheduleFound = false;
      else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
        scheduleFound =
            Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
      else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
        scheduleFound =
            Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
      else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
        LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
        // When scheduling a Phi it is better to start at the late cycle and
        // go backwards. The default order may insert the Phi too far away
        // from its first dependence.
        // Also, do backward search when all scheduled predecessors are
        // loop-carried output/order dependencies. Empirically, there are also
        // cases where scheduling becomes possible with backward search.
        if (SU->getInstr()->isPHI() ||
            Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
          scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
        else
          scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
      } else {
        // Unconstrained node: place it relative to the first cycle using its
        // ASAP value.
        int FirstCycle = Schedule.getFirstCycle();
        scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
                                        FirstCycle + getASAP(SU) + II - 1, II);
      }

      // Even if we find a schedule, make sure the schedule doesn't exceed the
      // allowable number of stages. We keep trying if this happens.
      if (scheduleFound)
        if (SwpMaxStages > -1 &&
            Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
          scheduleFound = false;

      LLVM_DEBUG({
        if (!scheduleFound)
          dbgs() << "\tCan't schedule\n";
      });
    } while (++NI != NE && scheduleFound);

    // If a schedule is found, validate it against the validation-only
    // dependencies.
    if (scheduleFound)
      scheduleFound = DDG->isValidSchedule(Schedule);

    // If a schedule is found, ensure non-pipelined instructions are in stage 0
    if (scheduleFound)
      scheduleFound =
          Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);

    // If a schedule is found, check if it is a valid schedule too.
    if (scheduleFound)
      scheduleFound = Schedule.isValidSchedule(this);

    // If a schedule was found and the option is enabled, check if the schedule
    // might generate additional register spills/fills.
    if (scheduleFound && LimitRegPressure)
      scheduleFound =
          !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
  }

  LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
                    << " (II=" << Schedule.getInitiationInterval()
                    << ")\n");

  if (scheduleFound) {
    // Give the target a final chance to reject the schedule.
    scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
    if (!scheduleFound)
      LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
  }

  if (scheduleFound) {
    Schedule.finalizeSchedule(this);
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Schedule found with Initiation Interval: "
             << ore::NV("II", Schedule.getInitiationInterval())
             << ", MaxStageCount: "
             << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
    });
  } else
    Schedule.reset();

  // A schedule with no overlapping stages provides no pipelining benefit.
  return scheduleFound && Schedule.getMaxStageCount() > 0;
}
2758
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  Register Result;
  for (const MachineOperand &Use : MI.all_uses()) {
    Register Reg = Use.getReg();
    // A physical-register use makes the analysis impossible.
    if (!Reg.isVirtual())
      return Register();
    // Only consider uses whose definition is in MI's own block.
    if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
      continue;
    // A second in-block-defined operand means the operand is not unique.
    if (Result)
      return Register();
    Result = Reg;
  }
  return Result;
}
2774
/// When Op is a value that is incremented recursively in a loop and there is a
/// unique instruction that increments it, returns true and sets Value.
  if (!Op.isReg() || !Op.getReg().isVirtual())
    return false;

  Register OrgReg = Op.getReg();
  Register CurReg = OrgReg;
  const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();

  const TargetInstrInfo *TII =
      LoopBB->getParent()->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI =
      LoopBB->getParent()->getSubtarget().getRegisterInfo();

  MachineInstr *Phi = nullptr;
  MachineInstr *Increment = nullptr;

  // Traverse definitions until it reaches Op or an instruction that does not
  // satisfy the condition.
  // Acceptable example:
  //   bb.0:
  //     %0 = PHI %3, %bb.0, ...
  //     %2 = ADD %0, Value
  //     ... = LOAD %2(Op)
  //     %3 = COPY %2
  while (true) {
    if (!CurReg.isValid() || !CurReg.isVirtual())
      return false;
    // Every definition on the chain must live in the loop block.
    MachineInstr *Def = MRI.getVRegDef(CurReg);
    if (Def->getParent() != LoopBB)
      return false;

    if (Def->isCopy()) {
      // Ignore copy instructions unless they contain subregisters
      if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
        return false;
      CurReg = Def->getOperand(1).getReg();
    } else if (Def->isPHI()) {
      // There must be just one Phi
      if (Phi)
        return false;
      Phi = Def;
      CurReg = getLoopPhiReg(*Def, LoopBB);
    } else if (TII->getIncrementValue(*Def, Value)) {
      // Potentially a unique increment
      if (Increment)
        // Multiple increments exist
        return false;

      const MachineOperand *BaseOp;
      int64_t Offset;
      bool OffsetIsScalable;
      if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
                                       TRI)) {
        // Pre/post increment instruction
        CurReg = BaseOp->getReg();
      } else {
        // If only one of the operands is defined within the loop, it is
        // assumed to be an incremented value.
        CurReg = findUniqueOperandDefinedInLoop(*Def);
        if (!CurReg.isValid())
          return false;
      }
      Increment = Def;
    } else {
      return false;
    }
    // Stop once the chain closes back on the original register.
    if (CurReg == OrgReg)
      break;
  }

  // A valid recurrence requires both a Phi and exactly one increment.
  if (!Phi || !Increment)
    return false;

  return true;
}
2853
2854/// Return true if we can compute the amount the instruction changes
2855/// during each iteration. Set Delta to the amount of the change.
2856bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
2857 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2858 const MachineOperand *BaseOp;
2859 int64_t Offset;
2860 bool OffsetIsScalable;
2861 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2862 return false;
2863
2864 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2865 if (OffsetIsScalable)
2866 return false;
2867
2868 if (!BaseOp->isReg())
2869 return false;
2870
2871 return findLoopIncrementValue(*BaseOp, Delta);
2872}
2873
2874/// Check if we can change the instruction to use an offset value from the
2875/// previous iteration. If so, return true and set the base and offset values
2876/// so that we can rewrite the load, if necessary.
2877/// v1 = Phi(v0, v3)
2878/// v2 = load v1, 0
2879/// v3 = post_store v1, 4, x
2880/// This function enables the load to be rewritten as v2 = load v3, 4.
2881bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2882 unsigned &BasePos,
2883 unsigned &OffsetPos,
2884 Register &NewBase,
2885 int64_t &Offset) {
2886 // Get the load instruction.
2887 if (TII->isPostIncrement(*MI))
2888 return false;
2889 unsigned BasePosLd, OffsetPosLd;
2890 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2891 return false;
2892 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2893
2894 // Look for the Phi instruction.
2895 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2896 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2897 if (!Phi || !Phi->isPHI())
2898 return false;
2899 // Get the register defined in the loop block.
2900 Register PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2901 if (!PrevReg)
2902 return false;
2903
2904 // Check for the post-increment load/store instruction.
2905 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2906 if (!PrevDef || PrevDef == MI)
2907 return false;
2908
2909 if (!TII->isPostIncrement(*PrevDef))
2910 return false;
2911
2912 unsigned BasePos1 = 0, OffsetPos1 = 0;
2913 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2914 return false;
2915
2916 // Make sure that the instructions do not access the same memory location in
2917 // the next iteration.
2918 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2919 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2920 MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2921 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2922 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2923 MF.deleteMachineInstr(NewMI);
2924 if (!Disjoint)
2925 return false;
2926
2927 // Set the return value once we determine that we return true.
2928 BasePos = BasePosLd;
2929 OffsetPos = OffsetPosLd;
2930 NewBase = PrevReg;
2931 Offset = StoreOffset;
2932 return true;
2933}
2934
/// Apply changes to the instruction if needed. The changes are needed
/// to improve the scheduling and depend upon the final schedule.
                                         SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
      InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    std::pair<Register, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    Register BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    // Rewrite only when the base register is defined in a later stage than
    // this instruction was scheduled in.
    if (BaseStageNum < DefStageNum) {
      MachineInstr *NewMI = MF.CloneMachineInstr(MI);
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        // Use the recorded alternate base; one stage of the offset is
        // absorbed by switching registers.
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      // Bump the offset by the per-stage delta times the stage distance.
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs[MI] = NewMI;
    }
  }
}
2970
/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
  MachineInstr *Def = MRI.getVRegDef(Reg);
  // Walk through chains of Phis; Visited guards against Phi cycles.
  while (Def->isPHI()) {
    if (!Visited.insert(Def).second)
      break;
    // Phi operands come in (value, basic-block) pairs starting at index 1;
    // follow the incoming value that comes from the loop body BB.
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}
2988
/// Return false if there is no overlap between the region accessed by BaseMI in
/// an iteration and the region accessed by OtherMI in subsequent iterations.
///
/// The test models each access as [Offset, Offset + Size) advancing by Delta
/// bytes per iteration, relative to a base that must be provably the same for
/// both instructions.
    const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
  // Both accesses must advance by the same known per-iteration delta;
  // otherwise conservatively assume they may overlap.
  int DeltaB, DeltaO, Delta;
  if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
      DeltaB != DeltaO)
    return true;
  Delta = DeltaB;

  const MachineOperand *BaseOpB, *BaseOpO;
  int64_t OffsetB, OffsetO;
  bool OffsetBIsScalable, OffsetOIsScalable;
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
                                    OffsetBIsScalable, TRI) ||
      !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
                                    OffsetOIsScalable, TRI))
    return true;

  // Scalable (vscale-relative) offsets cannot be compared numerically.
  if (OffsetBIsScalable || OffsetOIsScalable)
    return true;

  if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
    // Pass cases with different base operands but same initial values.
    // Typically for when pre/post increment is used.

    if (!BaseOpB->isReg() || !BaseOpO->isReg())
      return true;
    Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
    if (!RegB.isVirtual() || !RegO.isVirtual())
      return true;

    // Both bases must be loop Phis whose initial (preheader) values are
    // produced by identical instructions.
    MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
    MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
    if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
      return true;

    Register InitValB;
    Register LoopValB;
    Register InitValO;
    Register LoopValO;
    getPhiRegs(*DefB, BB, InitValB, LoopValB);
    getPhiRegs(*DefO, BB, InitValO, LoopValO);
    MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
    MachineInstr *InitDefO = MRI.getVRegDef(InitValO);

    if (!InitDefB->isIdenticalTo(*InitDefO))
      return true;
  }

  // NOTE(review): only the first memory operand of each instruction is
  // consulted — presumably instructions reaching here carry exactly one;
  // confirm against the callers.
  LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
  LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();

  // This is the main test, which checks the offset values and the loop
  // increment value to determine if the accesses may be loop carried.
  if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
    return true;

  LLVM_DEBUG({
    dbgs() << "Overlap check:\n";
    dbgs() << "  BaseMI: ";
    BaseMI->dump();
    dbgs() << "    Base + " << OffsetB << " + I * " << Delta
           << ", Len: " << AccessSizeB.getValue() << "\n";
    dbgs() << "  OtherMI: ";
    OtherMI->dump();
    dbgs() << "    Base + " << OffsetO << " + I * " << Delta
           << ", Len: " << AccessSizeO.getValue() << "\n";
  });

  // Excessive overlap may be detected in strided patterns.
  // For example, the memory addresses of the store and the load in
  //   for (i=0; i<n; i+=2) a[i+1] = a[i];
  // are assumed to overlap.
  if (Delta < 0) {
    // Addresses decrease each iteration: compare the lowest byte BaseMI can
    // touch against the highest byte OtherMI touches next iteration.
    int64_t BaseMinAddr = OffsetB;
    int64_t OhterNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
    if (BaseMinAddr > OhterNextIterMaxAddr) {
      LLVM_DEBUG(dbgs() << "  Result: No overlap\n");
      return false;
    }
  } else {
    // Addresses increase (or stay fixed): symmetric comparison.
    int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
    int64_t OtherNextIterMinAddr = OffsetO + Delta;
    if (BaseMaxAddr < OtherNextIterMinAddr) {
      LLVM_DEBUG(dbgs() << "  Result: No overlap\n");
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "  Result: Overlap\n");
  return true;
}
3082
/// Return true for an order or output dependence that is loop carried
/// potentially. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
    const SwingSchedulerDDGEdge &Edge) const {
  // Only order/output dependences qualify; artificial edges and edges into
  // the boundary node are never loop carried.
  if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
      Edge.getDst()->isBoundaryNode())
    return false;

    // NOTE(review): the guard condition for this early return is missing from
    // this excerpt — restore it from the original source before compiling.
    return true;

  // Without further analysis, output dependences are conservatively treated
  // as loop carried.
  if (Edge.isOutputDep())
    return true;

  MachineInstr *SI = Edge.getSrc()->getInstr();
  MachineInstr *DI = Edge.getDst()->getInstr();
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");

  // Assume ordered loads and stores may have a loop carried dependence.
  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    return true;

  // Non-memory instructions cannot carry a memory dependence.
  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
    return false;

  // The conservative assumption is that a dependence between memory operations
  // may be loop carried. The following code checks when it can be proved that
  // there is no loop carried dependence.
  return mayOverlapInLaterIter(DI, SI);
}
3116
3117void SwingSchedulerDAG::postProcessDAG() {
3118 for (auto &M : Mutations)
3119 M->apply(this);
3120}
3121
3122/// Try to schedule the node at the specified StartCycle and continue
3123/// until the node is schedule or the EndCycle is reached. This function
3124/// returns true if the node is scheduled. This routine may search either
3125/// forward or backward for a place to insert the instruction based upon
3126/// the relative values of StartCycle and EndCycle.
3127bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3128 bool forward = true;
3129 LLVM_DEBUG({
3130 dbgs() << "Trying to insert node between " << StartCycle << " and "
3131 << EndCycle << " II: " << II << "\n";
3132 });
3133 if (StartCycle > EndCycle)
3134 forward = false;
3135
3136 // The terminating condition depends on the direction.
3137 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3138 for (int curCycle = StartCycle; curCycle != termCycle;
3139 forward ? ++curCycle : --curCycle) {
3140
3141 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3142 ProcItinResources.canReserveResources(*SU, curCycle)) {
3143 LLVM_DEBUG({
3144 dbgs() << "\tinsert at cycle " << curCycle << " ";
3145 SU->getInstr()->dump();
3146 });
3147
3148 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3149 ProcItinResources.reserveResources(*SU, curCycle);
3150 ScheduledInstrs[curCycle].push_back(SU);
3151 InstrToCycle.insert(std::make_pair(SU, curCycle));
3152 if (curCycle > LastCycle)
3153 LastCycle = curCycle;
3154 if (curCycle < FirstCycle)
3155 FirstCycle = curCycle;
3156 return true;
3157 }
3158 LLVM_DEBUG({
3159 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3160 SU->getInstr()->dump();
3161 });
3162 }
3163 return false;
3164}
3165
// Return the cycle of the earliest scheduled instruction in the chain.
// Walks backwards over order/output in-edges starting at Dep's source and
// returns INT_MAX if no instruction in the chain has been scheduled yet.
                                     const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSrc();
    // Skip nodes already expanded.
    if (Visited.count(PrevSU))
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    // Unscheduled instructions do not constrain the result.
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    // Continue the chain through order/output predecessors only.
    for (const auto &IE : DDG->getInEdges(PrevSU))
      if (IE.isOrderDep() || IE.isOutputDep())
        Worklist.push_back(IE);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}
3189
// Return the cycle of the latest scheduled instruction in the chain.
// Walks forwards over order/output out-edges starting at Dep's destination
// and returns INT_MIN if no instruction in the chain has been scheduled yet.
                                    const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getDst();
    // Skip nodes already expanded and the exit boundary node.
    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    // Unscheduled instructions do not constrain the result.
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    // Continue the chain through order/output successors only.
    for (const auto &OE : DDG->getOutEdges(SuccSU))
      if (OE.isOrderDep() || OE.isOutputDep())
        Worklist.push_back(OE);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}
3213
/// If an instruction has a use that spans multiple iterations, then
/// return the Phi it feeds (nullptr otherwise). These instructions are
/// characterized by having a back-edge to a Phi, which contains a reference
/// to another Phi.
  // An anti-dependence into a Phi is the back-edge; the value spans multiple
  // iterations when that Phi in turn feeds another Phi through a data edge.
  for (auto &P : SU->Preds)
    if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}
3225
3226/// Compute the scheduling start slot for the instruction. The start slot
3227/// depends on any predecessor or successor nodes scheduled already.
3228void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3229 int II, SwingSchedulerDAG *DAG) {
3230 const SwingSchedulerDDG *DDG = DAG->getDDG();
3231
3232 // Iterate over each instruction that has been scheduled already. The start
3233 // slot computation depends on whether the previously scheduled instruction
3234 // is a predecessor or successor of the specified instruction.
3235 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3236 for (SUnit *I : getInstructions(cycle)) {
3237 for (const auto &IE : DDG->getInEdges(SU)) {
3238 if (IE.getSrc() == I) {
3239 // FIXME: Add reverse edge to `DDG` instead of calling
3240 // `isLoopCarriedDep`
3241 if (DAG->isLoopCarriedDep(IE)) {
3242 int End = earliestCycleInChain(IE, DDG) + (II - 1);
3243 *MinLateStart = std::min(*MinLateStart, End);
3244 }
3245 int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
3246 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3247 }
3248 }
3249
3250 for (const auto &OE : DDG->getOutEdges(SU)) {
3251 if (OE.getDst() == I) {
3252 // FIXME: Add reverse edge to `DDG` instead of calling
3253 // `isLoopCarriedDep`
3254 if (DAG->isLoopCarriedDep(OE)) {
3255 int Start = latestCycleInChain(OE, DDG) + 1 - II;
3256 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
3257 }
3258 int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
3259 *MinLateStart = std::min(*MinLateStart, LateStart);
3260 }
3261 }
3262
3263 SUnit *BE = multipleIterations(I, DAG);
3264 for (const auto &Dep : SU->Preds) {
3265 // For instruction that requires multiple iterations, make sure that
3266 // the dependent instruction is not scheduled past the definition.
3267 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3268 !SU->isPred(I))
3269 *MinLateStart = std::min(*MinLateStart, cycle);
3270 }
3271 }
3272 }
3273}
3274
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. The instruction is inserted at the front of the list
/// if it has a use among the instructions already in the list; otherwise
/// it is appended to the end.
                                  std::deque<SUnit *> &Insts) const {
  MachineInstr *MI = SU->getInstr();
  // Flags describing where SU must be placed relative to the instructions
  // already ordered, plus the positions of the conflicting def/use.
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);
  const SwingSchedulerDDG *DDG = SSD->getDDG();

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      unsigned BasePos, OffsetPos;
      // If this instruction's base register was rewritten by an instruction
      // change, compare against the new base register instead.
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (Register NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        // SU defines a value that *I (same or earlier stage) reads.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        // Loop-carried def of this use: tentatively order before the def.
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &OE : DDG->getOutEdges(SU)) {
      if (OE.getDst() != *I)
        continue;
      if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // We did not handle HW dependences in previous for loop,
      // and we normally set Latency = 0 for Anti/Output deps,
      // so may have nodes in same cycle with Anti/Output dependent on HW regs.
      else if ((OE.isAntiDep() || OE.isOutputDep()) &&
               stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &IE : DDG->getInEdges(SU)) {
      if (IE.getSrc() != *I)
        continue;
      // Any anti/output/order edge from *I into SU forces SU after it.
      if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
          stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def.
  if (OrderBeforeUse && OrderAfterDef) {
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    // Remove the conflicting entries (highest index first so the lower index
    // stays valid), then re-insert them and SU one at a time so each lands in
    // a consistent position.
    if (MoveUse > MoveDef) {
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}
3407
/// Return true if the scheduled Phi has a loop carried operand.
                             MachineInstr &Phi) const {
  if (!Phi.isPHI())
    return false;
  // NOTE(review): this assert is redundant given the early return above.
  assert(Phi.isPHI() && "Expecting a Phi.");
  SUnit *DefSU = SSD->getSUnit(&Phi);
  unsigned DefCycle = cycleScheduled(DefSU);
  int DefStage = stageScheduled(DefSU);

  Register InitVal;
  Register LoopVal;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
  // Conservatively treat an unknown or Phi-defined loop value as carried.
  if (!UseSU)
    return true;
  if (UseSU->getInstr()->isPHI())
    return true;
  // The operand is loop carried if its producer runs after the Phi within
  // the kernel, or in the same or an earlier stage.
  unsigned LoopCycle = cycleScheduled(UseSU);
  int LoopStage = stageScheduled(UseSU);
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}
3430
/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
///        v1 = phi(v2, v3)
///  (Def) v3 = op v1
///  (MO)   = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
                                        MachineInstr *Def,
                                        MachineOperand &MO) const {
  if (!MO.isReg())
    return false;
  if (Def->isPHI())
    return false;
  // MO must be fed by a loop-carried Phi in the same block as Def.
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  // Def carries the dependence if it produces the Phi's loop-back value.
  Register LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}
3457
/// Return true if all scheduled predecessors are loop-carried output/order
/// dependencies.
    SUnit *SU, const SwingSchedulerDDG *DDG) const {
  // NOTE(review): the edge kind is not inspected here — any in-edge whose
  // source is already scheduled yields false. Presumably the in-edges
  // reaching this query are filtered elsewhere; confirm against the callers.
  for (const auto &IE : DDG->getInEdges(SU))
    if (InstrToCycle.count(IE.getSrc()))
      return false;
  return true;
}
3467
/// Determine transitive dependences of unpipelineable instructions
/// by flooding backwards through the DDG from every node the target marks
/// as ignorable for pipelining.
  SmallPtrSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  // Seed with instructions the target asks us not to pipeline.
  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  const SwingSchedulerDDG *DDG = SSD->getDDG();
  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    // Everything this node depends on must not be pipelined either.
    for (const auto &IE : DDG->getInEdges(SU))
      Worklist.push_back(IE.getSrc());

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (const auto &OE : DDG->getOutEdges(SU))
      if (OE.getDistance() == 1)
        Worklist.push_back(OE.getDst());
  }
  return DoNotPipeline;
}
3496
// Determine all instructions upon which any unpipelineable instruction depends
// and ensure that they are in stage 0.  If unable to do so, return false.
  SmallPtrSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    // Already pipelineable or already in stage 0: only track the extent.
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule
    int NewCycle = getFirstCycle();
    for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
      if (IE.getDistance() == 0)
        NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
      if (OE.getDistance() == 1)
        NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);

    // Move the instruction to its new cycle if it changed.
    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }

    // We traverse the SUs in the order of the original basic block. Computing
    // NewCycle in this order normally works fine because all dependencies
    // (except for loop-carried dependencies) don't violate the original order.
    // However, an artificial dependency (e.g., added by CopyToPhiMutation) can
    // break it. That is, there may exist an artificial dependency from
    // bottom to top. In such a case, NewCycle may become too large to be
    // scheduled in Stage 0. For example, assume that Inst0 is in DNP in the
    // following case:
    //
    //             |  Inst0  <-+
    //   SU order  |           | artificial dep
    //             |  Inst1 ---+
    //             v
    //
    // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
    // Inst0 must be greater than or equal to N so that Inst0 is not
    // scheduled at Stage 0. In such cases, we reject this schedule at this
    // time.
    // FIXME: The reason for this is the existence of artificial dependencies
    // that contradict the original SU order. If ignoring artificial
    // dependencies does not affect correctness, then it is better to ignore
    // them.
    if (FirstCycle + InitiationInterval <= NewCycle)
      return false;

    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}
3565
// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid.  (The test checks for both
// earlier and same-cycle use to be more robust.)
  for (SUnit &SU : SSD->SUnits) {
    // Only defs of physical registers need validation.
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
      SUnit *Dst = OE.getDst();
      if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
        if (OE.getReg().isPhysical()) {
          // The use must be in the same stage and strictly after the def.
          if (stageScheduled(Dst) != StageDef)
            return false;
          if (InstrToCycle[Dst] <= CycleDef)
            return false;
        }
    }
  }
  return true;
}
3594
3595/// A property of the node order in swing-modulo-scheduling is
3596/// that for nodes outside circuits the following holds:
3597/// none of them is scheduled after both a successor and a
3598/// predecessor.
3599/// The method below checks whether the property is met.
3600/// If not, debug information is printed and statistics information updated.
3601/// Note that we do not use an assert statement.
3602/// The reason is that although an invalid node order may prevent
3603/// the pipeliner from finding a pipelined schedule for arbitrary II,
3604/// it does not lead to the generation of incorrect code.
3605void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3606
3607 // a sorted vector that maps each SUnit to its index in the NodeOrder
3608 typedef std::pair<SUnit *, unsigned> UnitIndex;
3609 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3610
3611 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3612 Indices.push_back(std::make_pair(NodeOrder[i], i));
3613
3614 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3615 return std::get<0>(i1) < std::get<0>(i2);
3616 };
3617
3618 // sort, so that we can perform a binary search
3619 llvm::sort(Indices, CompareKey);
3620
3621 bool Valid = true;
3622 (void)Valid;
3623 // for each SUnit in the NodeOrder, check whether
3624 // it appears after both a successor and a predecessor
3625 // of the SUnit. If this is the case, and the SUnit
3626 // is not part of circuit, then the NodeOrder is not
3627 // valid.
3628 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3629 SUnit *SU = NodeOrder[i];
3630 unsigned Index = i;
3631
3632 bool PredBefore = false;
3633 bool SuccBefore = false;
3634
3635 SUnit *Succ;
3636 SUnit *Pred;
3637 (void)Succ;
3638 (void)Pred;
3639
3640 for (const auto &IE : DDG->getInEdges(SU)) {
3641 SUnit *PredSU = IE.getSrc();
3642 unsigned PredIndex = std::get<1>(
3643 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3644 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3645 PredBefore = true;
3646 Pred = PredSU;
3647 break;
3648 }
3649 }
3650
3651 for (const auto &OE : DDG->getOutEdges(SU)) {
3652 SUnit *SuccSU = OE.getDst();
3653 // Do not process a boundary node, it was not included in NodeOrder,
3654 // hence not in Indices either, call to std::lower_bound() below will
3655 // return Indices.end().
3656 if (SuccSU->isBoundaryNode())
3657 continue;
3658 unsigned SuccIndex = std::get<1>(
3659 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3660 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3661 SuccBefore = true;
3662 Succ = SuccSU;
3663 break;
3664 }
3665 }
3666
3667 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3668 // instructions in circuits are allowed to be scheduled
3669 // after both a successor and predecessor.
3670 bool InCircuit = llvm::any_of(
3671 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3672 if (InCircuit)
3673 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3674 else {
3675 Valid = false;
3676 NumNodeOrderIssues++;
3677 LLVM_DEBUG(dbgs() << "Predecessor ");
3678 }
3679 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3680 << " are scheduled before node " << SU->NodeNum
3681 << "\n");
3682 }
3683 }
3684
3685 LLVM_DEBUG({
3686 if (!Valid)
3687 dbgs() << "Invalid node order found!\n";
3688 });
3689}
3690
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
///   p' = store_pi(p, b)
///      = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  // Tracks the most recently seen tied def/use pair: OverlapReg is the old
  // base (p), NewBaseReg the post-increment result (p').
  Register OverlapReg;
  Register NewBaseReg;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        // The pair has been consumed; stop tracking it.
        OverlapReg = Register();
        NewBaseReg = Register();
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}
3742
// Build a new serialized order for one cycle: all Phis first (in their
// original relative order), then the remaining instructions ordered so that
// definitions precede uses (see orderDependence).
std::deque<SUnit *>
                     const std::deque<SUnit *> &Instrs) const {
  std::deque<SUnit *> NewOrderPhi;
  for (SUnit *SU : Instrs) {
    if (SU->getInstr()->isPHI())
      NewOrderPhi.push_back(SU);
  }
  std::deque<SUnit *> NewOrderI;
  for (SUnit *SU : Instrs) {
    if (!SU->getInstr()->isPHI())
      orderDependence(SSD, SU, NewOrderI);
  }
  llvm::append_range(NewOrderPhi, NewOrderI);
  return NewOrderPhi;
}
3759
/// After the schedule has been formed, call this function to combine
/// the instructions from the different stages/cycles.  That is, this
/// function creates a schedule that represents a single iteration.
  // Move all instructions to the first stage from later stages.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
         ++stage) {
      std::deque<SUnit *> &cycleInstrs =
          ScheduledInstrs[cycle + (stage * InitiationInterval)];
      // Reverse iteration + push_front preserves the within-cycle order while
      // placing later-stage instructions before earlier ones.
      for (SUnit *SU : llvm::reverse(cycleInstrs))
        ScheduledInstrs[cycle].push_front(SU);
    }
  }

  // Erase all the elements in the later stages. Only one iteration should
  // remain in the scheduled list, and it contains all the instructions.
  for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
    ScheduledInstrs.erase(cycle);

  // Change the registers in instruction as specified in the InstrChanges
  // map. We need to use the new registers to create the correct order.
  for (const SUnit &SU : SSD->SUnits)
    SSD->applyInstrChange(SU.getInstr(), *this);

  // Reorder the instructions in each cycle to fix and improve the
  // generated code.
  for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
    std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
    cycleInstrs = reorderInstructions(SSD, cycleInstrs);
    SSD->fixupRegisterOverlaps(cycleInstrs);
  }

  LLVM_DEBUG(dump(););
}
3795
  // Print a one-line summary of the node set followed by its members.
  os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
     << " depth " << MaxDepth << " col " << Colocate << "\n";
  for (const auto &I : Nodes)
    os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
  os << "\n";
}
3803
3804#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the schedule information to the given output.
  // Iterate over each cycle.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    // Iterate over each instruction in the cycle.
    const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
    for (SUnit *CI : cycleInstrs->second) {
      // Emit: cycle, stage, node number, then the instruction itself.
      os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
      os << "(" << CI->NodeNum << ") ";
      CI->getInstr()->print(os);
      os << "\n";
    }
  }
}
3819
3820/// Utility function used for debugging to print the schedule.
3823
3824void ResourceManager::dumpMRT() const {
3825 LLVM_DEBUG({
3826 if (UseDFA)
3827 return;
3828 std::stringstream SS;
3829 SS << "MRT:\n";
3830 SS << std::setw(4) << "Slot";
3831 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3832 SS << std::setw(3) << I;
3833 SS << std::setw(7) << "#Mops"
3834 << "\n";
3835 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3836 SS << std::setw(4) << Slot;
3837 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3838 SS << std::setw(3) << MRT[Slot][I];
3839 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3840 }
3841 dbgs() << SS.str();
3842 });
3843}
3844#endif
3845
3847 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3848 unsigned ProcResourceID = 0;
3849
3850 // We currently limit the resource kinds to 64 and below so that we can use
3851 // uint64_t for Masks
3852 assert(SM.getNumProcResourceKinds() < 64 &&
3853 "Too many kinds of resources, unsupported");
3854 // Create a unique bitmask for every processor resource unit.
3855 // Skip resource at index 0, since it always references 'InvalidUnit'.
3856 Masks.resize(SM.getNumProcResourceKinds());
3857 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3859 if (Desc.SubUnitsIdxBegin)
3860 continue;
3861 Masks[I] = 1ULL << ProcResourceID;
3862 ProcResourceID++;
3863 }
3864 // Create a unique bitmask for every processor resource group.
3865 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3867 if (!Desc.SubUnitsIdxBegin)
3868 continue;
3869 Masks[I] = 1ULL << ProcResourceID;
3870 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3871 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3872 ProcResourceID++;
3873 }
3874 LLVM_DEBUG({
3875 if (SwpShowResMask) {
3876 dbgs() << "ProcResourceDesc:\n";
3877 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3878 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3879 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3880 ProcResource->Name, I, Masks[I],
3881 ProcResource->NumUnits);
3882 }
3883 dbgs() << " -----------------\n";
3884 }
3885 });
3886}
3887
// ResourceManager::canReserveResources — the signature line (original line
// 3888) was dropped by this extraction. Returns true if SU's resources can
// be booked at the given cycle without overbooking.
3889 LLVM_DEBUG({
3890 if (SwpDebugResource)
3891 dbgs() << "canReserveResources:\n";
3892 });
// DFA model: ask the packetizer for the wrapped cycle slot.
3893 if (UseDFA)
3894 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3895 ->canReserveResources(&SU.getInstr()->getDesc());
3896
3897 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
// Without a valid sched class there is nothing to book; treat as fitting.
3898 if (!SCDesc->isValid()) {
3899 LLVM_DEBUG({
3900 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3901 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3902 });
3903 return true;
3904 }
3905
// Trial booking: reserve, test for overbooking, then roll back so the
// table is left unchanged.
3906 reserveResources(SCDesc, Cycle);
3907 bool Result = !isOverbooked();
3908 unreserveResources(SCDesc, Cycle);
3909
3910 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
3911 return Result;
3912 }
3913
3914void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3915 LLVM_DEBUG({
3916 if (SwpDebugResource)
3917 dbgs() << "reserveResources:\n";
3918 });
3919 if (UseDFA)
3920 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3921 ->reserveResources(&SU.getInstr()->getDesc());
3922
3923 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3924 if (!SCDesc->isValid()) {
3925 LLVM_DEBUG({
3926 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3927 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3928 });
3929 return;
3930 }
3931
3932 reserveResources(SCDesc, Cycle);
3933
3934 LLVM_DEBUG({
3935 if (SwpDebugResource) {
3936 dumpMRT();
3937 dbgs() << "reserveResources: done!\n\n";
3938 }
3939 });
3940}
3941
3942void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3943 int Cycle) {
3944 assert(!UseDFA);
3945 for (const MCWriteProcResEntry &PRE : make_range(
3946 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3947 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3948 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3949
3950 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3951 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3952}
3953
3954void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3955 int Cycle) {
3956 assert(!UseDFA);
3957 for (const MCWriteProcResEntry &PRE : make_range(
3958 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3959 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3960 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3961
3962 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3963 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3964}
3965
3966bool ResourceManager::isOverbooked() const {
3967 assert(!UseDFA);
3968 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3969 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3970 const MCProcResourceDesc *Desc = SM.getProcResource(I);
3971 if (MRT[Slot][I] > Desc->NumUnits)
3972 return true;
3973 }
3974 if (NumScheduledMops[Slot] > IssueWidth)
3975 return true;
3976 }
3977 return false;
3978}
3979
// Compute the resource-constrained minimum initiation interval using the
// target's DFA packetizer: greedily pack every (non-zero-cost) instruction
// into per-cycle DFAs, adding new DFAs when an instruction's latency cannot
// be covered; the number of DFAs needed is the ResMII.
3980 int ResourceManager::calculateResMIIDFA() const {
3981 assert(UseDFA);
3982
3983 // Sort the instructions by the number of available choices for scheduling,
3984 // least to most. Use the number of critical resources as the tie breaker.
3985 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3986 for (SUnit &SU : DAG->SUnits)
3987 FUS.calcCriticalResources(*SU.getInstr());
// NOTE(review): the declaration line of FuncUnitOrder (original line 3988,
// a PriorityQueue of MachineInstr* ordered by FUS) was dropped by the
// extraction.
3989 FuncUnitOrder(FUS);
3990
3991 for (SUnit &SU : DAG->SUnits)
3992 FuncUnitOrder.push(SU.getInstr());
3993
// NOTE(review): the declaration line of Resources (original line 3994, a
// container of std::unique_ptr<DFAPacketizer>) was dropped by the
// extraction.
3995 Resources.push_back(
3996 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3997
// Process instructions from most to least constrained.
3998 while (!FuncUnitOrder.empty()) {
3999 MachineInstr *MI = FuncUnitOrder.top();
4000 FuncUnitOrder.pop();
// Zero-cost instructions consume no resources.
4001 if (TII->isZeroCost(MI->getOpcode()))
4002 continue;
4003
4004 // Attempt to reserve the instruction in an existing DFA. At least one
4005 // DFA is needed for each cycle.
4006 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
4007 unsigned ReservedCycles = 0;
4008 auto *RI = Resources.begin();
4009 auto *RE = Resources.end();
4010 LLVM_DEBUG({
4011 dbgs() << "Trying to reserve resource for " << NumCycles
4012 << " cycles for \n";
4013 MI->dump();
4014 });
// Walk the existing DFAs once, booking MI into the first DFA that can
// take it, for up to NumCycles cycles. RI is not reset between cycles,
// so each DFA is used at most once per instruction.
4015 for (unsigned C = 0; C < NumCycles; ++C)
4016 while (RI != RE) {
4017 if ((*RI)->canReserveResources(*MI)) {
4018 (*RI)->reserveResources(*MI);
4019 ++ReservedCycles;
4020 break;
4021 }
4022 RI++;
4023 }
4024 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
4025 << ", NumCycles:" << NumCycles << "\n");
4026 // Add new DFAs, if needed, to reserve resources.
4027 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
// NOTE(review): the opening LLVM_DEBUG(dbgs() line (original line 4028)
// was dropped by the extraction.
4029 << "NewResource created to reserve resources"
4030 << "\n");
4031 auto *NewResource = TII->CreateTargetScheduleState(*ST);
4032 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
4033 NewResource->reserveResources(*MI);
4034 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
4035 }
4036 }
4037
// Every DFA represents one cycle of the steady state, so the DFA count is
// the resource MII.
4038 int Resmii = Resources.size();
4039 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
4040 return Resmii;
4041 }
4042
// ResourceManager::calculateResMII — the signature line (original line
// 4043) was dropped by this extraction. Computes the resource-constrained
// minimum initiation interval: with the DFA model it delegates to
// calculateResMIIDFA(); otherwise it totals micro-ops and per-resource
// consumption across the loop and takes the maximum of the resulting
// per-resource cycle counts.
4044 if (UseDFA)
4045 return calculateResMIIDFA();
4046
4047 // Count each resource consumption and divide it by the number of units.
4048 // ResMII is the max value among them.
4049
4050 int NumMops = 0;
4051 SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
4052 for (SUnit &SU : DAG->SUnits) {
// Zero-cost instructions consume no resources; skip them.
4053 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
4054 continue;
4055
4056 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4057 if (!SCDesc->isValid())
4058 continue;
4059
4060 LLVM_DEBUG({
4061 if (SwpDebugResource) {
4062 DAG->dumpNode(SU);
4063 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
4064 << " WriteProcRes: ";
4065 }
4066 });
// Accumulate total micro-ops and per-resource-kind usage.
4067 NumMops += SCDesc->NumMicroOps;
4068 for (const MCWriteProcResEntry &PRE :
4069 make_range(STI->getWriteProcResBegin(SCDesc),
4070 STI->getWriteProcResEnd(SCDesc))) {
4071 LLVM_DEBUG({
4072 if (SwpDebugResource) {
4073 const MCProcResourceDesc *Desc =
4074 SM.getProcResource(PRE.ProcResourceIdx);
4075 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
4076 }
4077 });
4078 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
4079 }
4080 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
4081 }
4082
// Lower bound from issue width: ceil(total micro-ops / issue width).
4083 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
4084 LLVM_DEBUG({
4085 if (SwpDebugResource)
4086 dbgs() << "#Mops: " << NumMops << ", "
4087 << "IssueWidth: " << IssueWidth << ", "
4088 << "Cycles: " << Result << "\n";
4089 });
4090
4091 LLVM_DEBUG({
4092 if (SwpDebugResource) {
4093 std::stringstream SS;
4094 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
4095 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
4096 << "\n";
4097 dbgs() << SS.str();
4098 }
4099 });
// Raise the bound to the worst per-resource requirement:
// ceil(consumed / units) for each resource kind.
4100 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4101 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4102 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
4103 LLVM_DEBUG({
4104 if (SwpDebugResource) {
4105 std::stringstream SS;
4106 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
4107 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
4108 << std::setw(10) << Cycles << "\n";
4109 dbgs() << SS.str();
4110 }
4111 });
4112 if (Cycles > Result)
4113 Result = Cycles;
4114 }
4115 return Result;
4116 }
4117
// ResourceManager::init — the signature line (original line 4118) was
// dropped by this extraction. Resets all bookkeeping for a new initiation
// interval II: one DFA per cycle slot for the DFA model, plus a cleared
// modulo reservation table and micro-op counters for the non-DFA model.
4119 InitiationInterval = II;
// Rebuild one DFA packetizer per slot of the new interval.
4120 DFAResources.clear();
4121 DFAResources.resize(II);
4122 for (auto &I : DFAResources)
4123 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
// MRT: II rows, one zeroed counter per processor-resource kind.
4124 MRT.clear();
4125 MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
4126 NumScheduledMops.clear();
4127 NumScheduledMops.resize(II);
4128 }
4129
4130bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
4131 if (Pred.isArtificial() || Dst->isBoundaryNode())
4132 return true;
4133 // Currently, dependence that is an anti-dependences but not a loop-carried is
4134 // also ignored. This behavior is preserved to prevent regression.
4135 // FIXME: Remove if this doesn't have significant impact on performance
4136 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
4137}
4138
4139SwingSchedulerDDG::SwingSchedulerDDGEdges &
4140SwingSchedulerDDG::getEdges(const SUnit *SU) {
4141 if (SU == EntrySU)
4142 return EntrySUEdges;
4143 if (SU == ExitSU)
4144 return ExitSUEdges;
4145 return EdgesVec[SU->NodeNum];
4146}
4147
4148const SwingSchedulerDDG::SwingSchedulerDDGEdges &
4149SwingSchedulerDDG::getEdges(const SUnit *SU) const {
4150 if (SU == EntrySU)
4151 return EntrySUEdges;
4152 if (SU == ExitSU)
4153 return ExitSUEdges;
4154 return EdgesVec[SU->NodeNum];
4155}
4156
4157void SwingSchedulerDDG::addEdge(const SUnit *SU,
4158 const SwingSchedulerDDGEdge &Edge) {
4159 assert(!Edge.isValidationOnly() &&
4160 "Validation-only edges are not expected here.");
4161 auto &Edges = getEdges(SU);
4162 if (Edge.getSrc() == SU)
4163 Edges.Succs.push_back(Edge);
4164 else
4165 Edges.Preds.push_back(Edge);
4166}
4167
4168void SwingSchedulerDDG::initEdges(SUnit *SU) {
4169 for (const auto &PI : SU->Preds) {
4170 SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false,
4171 /*IsValidationOnly=*/false);
4172 addEdge(SU, Edge);
4173 }
4174
4175 for (const auto &SI : SU->Succs) {
4176 SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true,
4177 /*IsValidationOnly=*/false);
4178 addEdge(SU, Edge);
4179 }
4180}
4181
4182SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
4183 SUnit *ExitSU, const LoopCarriedEdges &LCE)
4184 : EntrySU(EntrySU), ExitSU(ExitSU) {
4185 EdgesVec.resize(SUnits.size());
4186
4187 // Add non-loop-carried edges based on the DAG.
4188 initEdges(EntrySU);
4189 initEdges(ExitSU);
4190 for (auto &SU : SUnits)
4191 initEdges(&SU);
4192
4193 // Add loop-carried edges, which are not represented in the DAG.
4194 for (SUnit &SU : SUnits) {
4195 SUnit *Src = &SU;
4196 if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) {
4197 SDep Base(Src, SDep::Barrier);
4198 Base.setLatency(1);
4199 for (SUnit *Dst : *OD) {
4200 SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false,
4201 /*IsValidationOnly=*/true);
4202 Edge.setDistance(1);
4203 ValidationOnlyEdges.push_back(Edge);
4204 }
4205 }
4206 }
4207}
4208
// SwingSchedulerDDG::getInEdges — the signature lines (original lines
// 4209-4210) were dropped by this extraction. Returns SU's predecessor
// (incoming) edge list.
4211 return getEdges(SU).Preds;
4212 }
4213
// SwingSchedulerDDG::getOutEdges — the signature lines (original lines
// 4214-4215) were dropped by this extraction. Returns SU's successor
// (outgoing) edge list.
4216 return getEdges(SU).Succs;
4217 }
4218
/// Check if \p Schedule doesn't violate the validation-only dependencies.
// NOTE(review): the function signature (original line 4220) was dropped by
// this extraction; this is the body.
4221 unsigned II = Schedule.getInitiationInterval();
4222
// Convert a node's (stage, in-kernel cycle) pair into an absolute cycle in
// the flattened (non-modulo) schedule.
4223 auto ExpandCycle = [&](SUnit *SU) {
4224 int Stage = Schedule.stageScheduled(SU);
4225 int Cycle = Schedule.cycleScheduled(SU);
4226 return Cycle + (Stage * II);
4227 };
4228
4229 for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4230 SUnit *Src = Edge.getSrc();
4231 SUnit *Dst = Edge.getDst();
// Only instruction nodes have meaningful schedule positions.
4232 if (!Src->isInstr() || !Dst->isInstr())
4233 continue;
4234 int CycleSrc = ExpandCycle(Src);
4235 int CycleDst = ExpandCycle(Dst);
// Latest cycle at which Src may start while still honoring the edge's
// latency across `Distance` iterations of the kernel.
4236 int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
4237 if (CycleSrc > MaxLateStart) {
4238 LLVM_DEBUG({
4239 dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
4240 << Dst->NodeNum << "\n";
4241 });
4242 return false;
4243 }
4244 }
4245 return true;
4246 }
4247
4248void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
4249 const TargetInstrInfo *TII) {
4250 for (SUnit &SU : SUnits) {
4251 SUnit *Src = &SU;
4252 if (auto *OrderDep = getOrderDepOrNull(Src)) {
4253 SDep Dep(Src, SDep::Barrier);
4254 Dep.setLatency(1);
4255 for (SUnit *Dst : *OrderDep) {
4256 SUnit *From = Src;
4257 SUnit *To = Dst;
4258 if (From->NodeNum > To->NodeNum)
4259 std::swap(From, To);
4260
4261 // Add a forward edge if the following conditions are met:
4262 //
4263 // - The instruction of the source node (FromMI) may read memory.
4264 // - The instruction of the target node (ToMI) may modify memory, but
4265 // does not read it.
4266 // - Neither instruction is a global barrier.
4267 // - The load appears before the store in the original basic block.
4268 // - There are no barrier or store instructions between the two nodes.
4269 // - The target node is unreachable from the source node in the current
4270 // DAG.
4271 //
4272 // TODO: These conditions are inherited from a previous implementation,
4273 // and some may no longer be necessary. For now, we conservatively
4274 // retain all of them to avoid regressions, but the logic could
4275 // potentially be simplified
4276 MachineInstr *FromMI = From->getInstr();
4277 MachineInstr *ToMI = To->getInstr();
4278 if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
4279 !TII->isGlobalMemoryObject(FromMI) &&
4280 !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
4281 SDep Pred = Dep;
4282 Pred.setSUnit(From);
4283 To->addPred(Pred);
4284 }
4285 }
4286 }
4287 }
4288}
4289
// LoopCarriedEdges::dump — the first signature line (original line 4290)
// was dropped by this extraction; this is the remainder of the signature
// and the body. Prints the loop-carried order dependences originating at
// \p SU, or nothing if there are none. MRI is currently unused here.
4291 const MachineRegisterInfo *MRI) const {
4292 const auto *Order = getOrderDepOrNull(SU);
4293
4294 if (!Order)
4295 return;
4296
// Helper: render a node as "SU(<n>)".
4297 const auto DumpSU = [](const SUnit *SU) {
4298 std::ostringstream OSS;
4299 OSS << "SU(" << SU->NodeNum << ")";
4300 return OSS.str();
4301 };
4302
4303 dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
4304 << " Order\n";
4305 for (SUnit *Dst : *Order)
4306 dbgs() << " " << DumpSU(Dst) << "\n";
4307 }
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static std::optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
BlockVerifier::State From
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:687
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:638
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1328
bool End
Definition: ELF_riscv.cpp:480
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
static void addEdge(SmallVectorImpl< LazyCallGraph::Edge > &Edges, DenseMap< LazyCallGraph::Node *, int > &EdgeIndexMap, LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, Register &InitVal, Register &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static Register getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value)
When Op is a value that is incremented recursively in a loop and there is a unique instruction that i...
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst, BatchAAResults &BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, const SwingSchedulerDAG *SSD, bool PerformCheapCheck)
Returns true if there is a loop-carried order dependency from Src to Dst.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:39
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
std::pair< BasicBlock *, BasicBlock * > Edge
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
#define LLVM_DEBUG(...)
Definition: Debug.h:119
Target-Independent Code Generator Pass Configuration Options pass.
static unsigned getSize(unsigned Kind)
Add loop-carried chain dependencies.
void computeDependencies()
The main function to compute loop-carried order-dependencies.
const BitVector & getLoopCarried(unsigned Idx) const
LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
A private abstract base class describing the concept of an individual alias analysis implementation.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM_ABI bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:124
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:203
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:177
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:245
bool erase(const KeyT &Val)
Definition: DenseMap.h:319
bool empty() const
Definition: DenseMap.h:119
iterator end()
Definition: DenseMap.h:87
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:230
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Definition: Pass.cpp:188
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:352
A possibly irreducible generalization of a Loop.
bool getIncrementValue(const MachineInstr &MI, int &Value) const override
If the instruction is an increment of a constant value, return the amount.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isPostIncrement(const MachineInstr &MI) const override
Return true for post-incremented instructions.
DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override
Create machine specific model for scheduling.
bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const override
For instructions with a base and offset, return the position of the base register and offset operands...
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:428
bool hasValue() const
TypeSize getValue() const
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:644
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:603
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:64
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1445
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1443
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1451
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:899
A single uniqued string.
Definition: Metadata.h:720
LLVM_ABI StringRef getString() const
Definition: Metadata.cpp:617
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:72
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:587
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:359
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:754
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:584
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:798
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:928
LLVM_ABI void dump() const
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
bool runOnMachineFunction(MachineFunction &MF) override
The "main" function for implementing Swing Modulo Scheduling.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:99
void dump() const
Definition: Pass.cpp:146
A reimplementation of ModuloScheduleExpander.
PointerIntPair - This class implements a pair of a pointer and small integer.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
LLVM_ABI void runOnMachineFunction(const MachineFunction &MF, bool Rev=false)
runOnFunction - Prepare to answer questions about MF.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition: Register.h:102
constexpr bool isValid() const
Definition: Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:78
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:51
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:54
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:58
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:56
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:71
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:74
void setSUnit(SUnit *SU)
Definition: ScheduleDAG.h:510
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the earliest scheduled instruction in the dependence chain.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
SmallPtrSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the latest scheduled instruction in the dependence chain.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:249
unsigned NumPreds
Definition: ScheduleDAG.h:279
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:387
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:277
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:391
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:458
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:312
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:367
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:301
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:270
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:269
LLVM_ABI bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:399
A ScheduleDAG for scheduling lists of MachineInstr.
virtual void finishBlock()
Cleans up after scheduling in the given block.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
LLVM_ABI void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
LLVM_ABI void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
LLVM_ABI bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:587
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:588
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:585
SUnit EntrySU
Special node for the region entry.
Definition: ScheduleDAG.h:589
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:586
SUnit ExitSU
Special node for the region exit.
Definition: ScheduleDAG.h:590
A vector that has set insertion semantics.
Definition: SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:104
void insert_range(Range &&R)
Definition: SetVector.h:193
typename vector_type::const_iterator iterator
Definition: SetVector.h:71
void clear()
Completely clear the SetVector.
Definition: SetVector.h:284
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:279
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:168
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:269
SlotIndexes pass.
Definition: SlotIndexes.h:298
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:532
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:283
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:418
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:470
iterator end() const
Definition: SmallPtrSet.h:499
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
iterator begin() const
Definition: SmallPtrSet.h:494
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:476
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:356
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const
Return true for an order or output dependence that is loop carried potentially.
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
bool mayOverlapInLaterIter(const MachineInstr *BaseMI, const MachineInstr *OtherMI) const
Return false if there is no overlap between the region accessed by BaseMI in an iteration and the reg...
Register getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
Represents a dependence between two instruction.
SUnit * getDst() const
Returns the SUnit to which the edge points (destination node).
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG nodes that we ignore when computing the cost functions.
SUnit * getSrc() const
Returns the SUnit from which the edge comes (source node).
This class provides APIs to retrieve edges from/to an SUnit node, with a particular focus on loop-car...
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU, const LoopCarriedEdges &LCE)
const EdgesType & getInEdges(const SUnit *SU) const
bool isValidSchedule(const SMSchedule &Schedule) const
Check if Schedule doesn't violate the validation-only dependencies.
const EdgesType & getOutEdges(const SUnit *SU) const
Object returned by analyzeLoopForPipelining.
virtual bool isMVEExpanderSupported()
Return true if the target can expand pipelined schedule with modulo variable expansion.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
TargetInstrInfo - Interface to description of machine instruction set.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool enableMachinePipeliner() const
True if the subtarget should run MachinePipeliner.
virtual bool useDFAforSMS() const
Default to DFA for resource management, return false when target will use ProcResource in InstrSchedM...
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const InstrItineraryData * getInstrItineraryData() const
getInstrItineraryData - Returns instruction itinerary data for the target or specific subtarget.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
LLVM Value Representation.
Definition: Value.h:75
The main class in the implementation of the target independent window scheduler.
unsigned getPosition() const
Definition: CommandLine.h:307
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:169
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition: COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:712
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
constexpr double e
Definition: MathExtras.h:47
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition: SFrame.h:77
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition: DWP.cpp:477
void stable_sort(R &&Range)
Definition: STLExtras.h:2077
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:307
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1702
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
static int64_t computeDelta(SectionEntry *A, SectionEntry *B)
@ WS_Force
Use window algorithm after SMS algorithm fails.
@ WS_On
Turn off window algorithm.
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:126
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2013
LLVM_ABI char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
static cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
This class holds an SUnit corresponding to a memory operation and other information related to the in...
const Value * MemOpValue
The value of a memory operand.
SmallVector< const Value *, 2 > UnderlyingObjs
bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const
int64_t MemOpOffset
The offset of a memory operand.
bool IsAllIdentified
True if all the underlying objects are identified.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition: Metadata.h:760
Description of the encoding of one expression Op.
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Represents loop-carried dependencies.
const OrderDep * getOrderDepOrNull(SUnit *Key) const
void modifySUnits(std::vector< SUnit > &SUnits, const TargetInstrInfo *TII)
Adds some edges to the original DAG that correspond to loop-carried dependencies.
void dump(SUnit *SU, const TargetRegisterInfo *TRI, const MachineRegisterInfo *MRI) const
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:36
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:123
bool isValid() const
Definition: MCSchedule.h:141
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:366
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:355
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:340
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:359
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:68
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
Store the effects of a change in pressure on things that MI scheduler cares about.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.