// NOTE: This file was extracted from the LLVM 21.0.0git doxygen page for
// MachinePipeliner.cpp; the page navigation text has been commented out.
//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
//
// This SMS implementation is a target-independent back-end pass. When enabled,
// the pass runs just prior to the register allocation pass, while the machine
// IR is in SSA form. If software pipelining is successful, then the original
// loop is replaced by the optimized loop. The optimized loop contains one or
// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
// the instructions cannot be scheduled in a given MII, we increase the MII by
// one and try again.
//
// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
// represent loop carried dependences in the DAG as order edges to the Phi
// nodes. We also perform several passes over the DAG to eliminate unnecessary
// edges that inhibit the ability to pipeline. The implementation uses the
// DFAPacketizer class to compute the minimum initiation interval and the check
// where an instruction may be inserted in the pipelined schedule.
//
// In order for the SMS pass to work, several target specific hooks need to be
// implemented to get information about the loop structure and to rewrite
// instructions.
//
//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
38#include "llvm/ADT/STLExtras.h"
40#include "llvm/ADT/SetVector.h"
42#include "llvm/ADT/SmallSet.h"
44#include "llvm/ADT/Statistic.h"
73#include "llvm/Config/llvm-config.h"
74#include "llvm/IR/Attributes.h"
75#include "llvm/IR/Function.h"
76#include "llvm/MC/LaneBitmask.h"
77#include "llvm/MC/MCInstrDesc.h"
79#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
84#include <algorithm>
85#include <cassert>
86#include <climits>
87#include <cstdint>
88#include <deque>
89#include <functional>
90#include <iomanip>
91#include <iterator>
92#include <map>
93#include <memory>
94#include <sstream>
95#include <tuple>
96#include <utility>
97#include <vector>
98
99using namespace llvm;
100
101#define DEBUG_TYPE "pipeliner"
102
103STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
104STATISTIC(NumPipelined, "Number of loops software pipelined");
105STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
106STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
107STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
108STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
109STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
110STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
111STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
112STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
113STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
114
115/// A command line option to turn software pipelining on or off.
116static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
117 cl::desc("Enable Software Pipelining"));
118
119/// A command line option to enable SWP at -Os.
120static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
121 cl::desc("Enable SWP at Os."), cl::Hidden,
122 cl::init(false));
123
124/// A command line argument to limit minimum initial interval for pipelining.
125static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
126 cl::desc("Size limit for the MII."),
127 cl::Hidden, cl::init(27));
128
129/// A command line argument to force pipeliner to use specified initial
130/// interval.
131static cl::opt<int> SwpForceII("pipeliner-force-ii",
132 cl::desc("Force pipeliner to use specified II."),
133 cl::Hidden, cl::init(-1));
134
135/// A command line argument to limit the number of stages in the pipeline.
136static cl::opt<int>
137 SwpMaxStages("pipeliner-max-stages",
138 cl::desc("Maximum stages allowed in the generated scheduled."),
139 cl::Hidden, cl::init(3));
140
141/// A command line option to disable the pruning of chain dependences due to
142/// an unrelated Phi.
143static cl::opt<bool>
144 SwpPruneDeps("pipeliner-prune-deps",
145 cl::desc("Prune dependences between unrelated Phi nodes."),
146 cl::Hidden, cl::init(true));
147
148/// A command line option to disable the pruning of loop carried order
149/// dependences.
150static cl::opt<bool>
151 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
152 cl::desc("Prune loop carried order dependences."),
153 cl::Hidden, cl::init(true));
154
155#ifndef NDEBUG
156static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
157#endif
158
159static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
161 cl::desc("Ignore RecMII"));
162
163static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
164 cl::init(false));
165static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
166 cl::init(false));
167
169 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
170 cl::desc("Instead of emitting the pipelined code, annotate instructions "
171 "with the generated schedule for feeding into the "
172 "-modulo-schedule-test pass"));
173
175 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
176 cl::desc(
177 "Use the experimental peeling code generator for software pipelining"));
178
179static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
180 cl::desc("Range to search for II"),
181 cl::Hidden, cl::init(10));
182
183static cl::opt<bool>
184 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
185 cl::desc("Limit register pressure of scheduled loop"));
186
187static cl::opt<int>
188 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
189 cl::init(5),
190 cl::desc("Margin representing the unused percentage of "
191 "the register pressure limit"));
192
193static cl::opt<bool>
194 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
195 cl::desc("Use the MVE code generator for software pipelining"));
196
197namespace llvm {
198
199// A command line option to enable the CopyToPhi DAG mutation.
201 cl::init(true),
202 cl::desc("Enable CopyToPhi DAG Mutation"));
203
204/// A command line argument to force pipeliner to use specified issue
205/// width.
207 "pipeliner-force-issue-width",
208 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
209 cl::init(-1));
210
211/// A command line argument to set the window scheduling option.
214 cl::desc("Set how to use window scheduling algorithm."),
216 "Turn off window algorithm."),
218 "Use window algorithm after SMS algorithm fails."),
220 "Use window algorithm instead of SMS algorithm.")));
221
222} // end namespace llvm
223
224unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
225char MachinePipeliner::ID = 0;
226#ifndef NDEBUG
228#endif
230
232 "Modulo Software Pipelining", false, false)
238 "Modulo Software Pipelining", false, false)
239
240/// The "main" function for implementing Swing Modulo Scheduling.
241bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
242 if (skipFunction(mf.getFunction()))
243 return false;
244
245 if (!EnableSWP)
246 return false;
247
248 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
250 return false;
251
252 if (!mf.getSubtarget().enableMachinePipeliner())
253 return false;
254
255 // Cannot pipeline loops without instruction itineraries if we are using
256 // DFA for the pipeliner.
257 if (mf.getSubtarget().useDFAforSMS() &&
258 (!mf.getSubtarget().getInstrItineraryData() ||
259 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
260 return false;
261
262 MF = &mf;
263 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
264 MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
265 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
266 TII = MF->getSubtarget().getInstrInfo();
267 RegClassInfo.runOnMachineFunction(*MF);
268
269 for (const auto &L : *MLI)
270 scheduleLoop(*L);
271
272 return false;
273}
274
275/// Attempt to perform the SMS algorithm on the specified loop. This function is
276/// the main entry point for the algorithm. The function identifies candidate
277/// loops, calculates the minimum initiation interval, and attempts to schedule
278/// the loop.
279bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
280 bool Changed = false;
281 for (const auto &InnerLoop : L)
282 Changed |= scheduleLoop(*InnerLoop);
283
284#ifndef NDEBUG
285 // Stop trying after reaching the limit (if any).
286 int Limit = SwpLoopLimit;
287 if (Limit >= 0) {
288 if (NumTries >= SwpLoopLimit)
289 return Changed;
290 NumTries++;
291 }
292#endif
293
294 setPragmaPipelineOptions(L);
295 if (!canPipelineLoop(L)) {
296 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
297 ORE->emit([&]() {
298 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
299 L.getStartLoc(), L.getHeader())
300 << "Failed to pipeline loop";
301 });
302
303 LI.LoopPipelinerInfo.reset();
304 return Changed;
305 }
306
307 ++NumTrytoPipeline;
308 if (useSwingModuloScheduler())
309 Changed = swingModuloScheduler(L);
310
311 if (useWindowScheduler(Changed))
312 Changed = runWindowScheduler(L);
313
314 LI.LoopPipelinerInfo.reset();
315 return Changed;
316}
317
318void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
319 // Reset the pragma for the next loop in iteration.
320 disabledByPragma = false;
321 II_setByPragma = 0;
322
323 MachineBasicBlock *LBLK = L.getTopBlock();
324
325 if (LBLK == nullptr)
326 return;
327
328 const BasicBlock *BBLK = LBLK->getBasicBlock();
329 if (BBLK == nullptr)
330 return;
331
332 const Instruction *TI = BBLK->getTerminator();
333 if (TI == nullptr)
334 return;
335
336 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
337 if (LoopID == nullptr)
338 return;
339
340 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
341 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
342
343 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
344 MDNode *MD = dyn_cast<MDNode>(MDO);
345
346 if (MD == nullptr)
347 continue;
348
349 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
350
351 if (S == nullptr)
352 continue;
353
354 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
355 assert(MD->getNumOperands() == 2 &&
356 "Pipeline initiation interval hint metadata should have two operands.");
358 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
359 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
360 } else if (S->getString() == "llvm.loop.pipeline.disable") {
361 disabledByPragma = true;
362 }
363 }
364}
365
366/// Return true if the loop can be software pipelined. The algorithm is
367/// restricted to loops with a single basic block. Make sure that the
368/// branch in the loop can be analyzed.
369bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
370 if (L.getNumBlocks() != 1) {
371 ORE->emit([&]() {
372 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
373 L.getStartLoc(), L.getHeader())
374 << "Not a single basic block: "
375 << ore::NV("NumBlocks", L.getNumBlocks());
376 });
377 return false;
378 }
379
380 if (disabledByPragma) {
381 ORE->emit([&]() {
382 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
383 L.getStartLoc(), L.getHeader())
384 << "Disabled by Pragma.";
385 });
386 return false;
387 }
388
389 // Check if the branch can't be understood because we can't do pipelining
390 // if that's the case.
391 LI.TBB = nullptr;
392 LI.FBB = nullptr;
393 LI.BrCond.clear();
394 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
395 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
396 NumFailBranch++;
397 ORE->emit([&]() {
398 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
399 L.getStartLoc(), L.getHeader())
400 << "The branch can't be understood";
401 });
402 return false;
403 }
404
405 LI.LoopInductionVar = nullptr;
406 LI.LoopCompare = nullptr;
408 if (!LI.LoopPipelinerInfo) {
409 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
410 NumFailLoop++;
411 ORE->emit([&]() {
412 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
413 L.getStartLoc(), L.getHeader())
414 << "The loop structure is not supported";
415 });
416 return false;
417 }
418
419 if (!L.getLoopPreheader()) {
420 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
421 NumFailPreheader++;
422 ORE->emit([&]() {
423 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
424 L.getStartLoc(), L.getHeader())
425 << "No loop preheader found";
426 });
427 return false;
428 }
429
430 // Remove any subregisters from inputs to phi nodes.
431 preprocessPhiNodes(*L.getHeader());
432 return true;
433}
434
435void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
437 SlotIndexes &Slots =
438 *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
439
440 for (MachineInstr &PI : B.phis()) {
441 MachineOperand &DefOp = PI.getOperand(0);
442 assert(DefOp.getSubReg() == 0);
443 auto *RC = MRI.getRegClass(DefOp.getReg());
444
445 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
446 MachineOperand &RegOp = PI.getOperand(i);
447 if (RegOp.getSubReg() == 0)
448 continue;
449
450 // If the operand uses a subregister, replace it with a new register
451 // without subregisters, and generate a copy to the new register.
452 Register NewReg = MRI.createVirtualRegister(RC);
453 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
455 const DebugLoc &DL = PredB.findDebugLoc(At);
456 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
457 .addReg(RegOp.getReg(), getRegState(RegOp),
458 RegOp.getSubReg());
459 Slots.insertMachineInstrInMaps(*Copy);
460 RegOp.setReg(NewReg);
461 RegOp.setSubReg(0);
462 }
463 }
464}
465
466/// The SMS algorithm consists of the following main steps:
467/// 1. Computation and analysis of the dependence graph.
468/// 2. Ordering of the nodes (instructions).
469/// 3. Attempt to Schedule the loop.
470bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
471 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
472
474 *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
476
477 MachineBasicBlock *MBB = L.getHeader();
478 // The kernel should not include any terminator instructions. These
479 // will be added back later.
480 SMS.startBlock(MBB);
481
482 // Compute the number of 'real' instructions in the basic block by
483 // ignoring terminators.
484 unsigned size = MBB->size();
486 E = MBB->instr_end();
487 I != E; ++I, --size)
488 ;
489
490 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
491 SMS.schedule();
492 SMS.exitRegion();
493
494 SMS.finishBlock();
495 return SMS.hasNewSchedule();
496}
497
507}
508
509bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
510 MachineSchedContext Context;
511 Context.MF = MF;
512 Context.MLI = MLI;
513 Context.MDT = MDT;
514 Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
515 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
516 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
518 WindowScheduler WS(&Context, L);
519 return WS.run();
520}
521
522bool MachinePipeliner::useSwingModuloScheduler() {
523 // SwingModuloScheduler does not work when WindowScheduler is forced.
525}
526
527bool MachinePipeliner::useWindowScheduler(bool Changed) {
528 // WindowScheduler does not work for following cases:
529 // 1. when it is off.
530 // 2. when SwingModuloScheduler is successfully scheduled.
531 // 3. when pragma II is enabled.
532 if (II_setByPragma) {
533 LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
534 "llvm.loop.pipeline.initiationinterval is set.\n");
535 return false;
536 }
537
540}
541
542void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
543 if (SwpForceII > 0)
544 MII = SwpForceII;
545 else if (II_setByPragma > 0)
546 MII = II_setByPragma;
547 else
548 MII = std::max(ResMII, RecMII);
549}
550
551void SwingSchedulerDAG::setMAX_II() {
552 if (SwpForceII > 0)
553 MAX_II = SwpForceII;
554 else if (II_setByPragma > 0)
555 MAX_II = II_setByPragma;
556 else
557 MAX_II = MII + SwpIISearchRange;
558}
559
560/// We override the schedule function in ScheduleDAGInstrs to implement the
561/// scheduling part of the Swing Modulo Scheduling algorithm.
563 AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
564 buildSchedGraph(AA);
565 addLoopCarriedDependences(AA);
566 updatePhiDependences();
568 changeDependences();
569 postProcessDAG();
570 DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
571 LLVM_DEBUG(dump());
572
573 NodeSetType NodeSets;
574 findCircuits(NodeSets);
575 NodeSetType Circuits = NodeSets;
576
577 // Calculate the MII.
578 unsigned ResMII = calculateResMII();
579 unsigned RecMII = calculateRecMII(NodeSets);
580
581 fuseRecs(NodeSets);
582
583 // This flag is used for testing and can cause correctness problems.
584 if (SwpIgnoreRecMII)
585 RecMII = 0;
586
587 setMII(ResMII, RecMII);
588 setMAX_II();
589
590 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
591 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
592
593 // Can't schedule a loop without a valid MII.
594 if (MII == 0) {
595 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
596 NumFailZeroMII++;
597 Pass.ORE->emit([&]() {
599 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
600 << "Invalid Minimal Initiation Interval: 0";
601 });
602 return;
603 }
604
605 // Don't pipeline large loops.
606 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
607 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
608 << ", we don't pipeline large loops\n");
609 NumFailLargeMaxMII++;
610 Pass.ORE->emit([&]() {
612 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
613 << "Minimal Initiation Interval too large: "
614 << ore::NV("MII", (int)MII) << " > "
615 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
616 << "Refer to -pipeliner-max-mii.";
617 });
618 return;
619 }
620
621 computeNodeFunctions(NodeSets);
622
623 registerPressureFilter(NodeSets);
624
625 colocateNodeSets(NodeSets);
626
627 checkNodeSets(NodeSets);
628
629 LLVM_DEBUG({
630 for (auto &I : NodeSets) {
631 dbgs() << " Rec NodeSet ";
632 I.dump();
633 }
634 });
635
636 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
637
638 groupRemainingNodes(NodeSets);
639
640 removeDuplicateNodes(NodeSets);
641
642 LLVM_DEBUG({
643 for (auto &I : NodeSets) {
644 dbgs() << " NodeSet ";
645 I.dump();
646 }
647 });
648
649 computeNodeOrder(NodeSets);
650
651 // check for node order issues
652 checkValidNodeOrder(Circuits);
653
654 SMSchedule Schedule(Pass.MF, this);
655 Scheduled = schedulePipeline(Schedule);
656
657 if (!Scheduled){
658 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
659 NumFailNoSchedule++;
660 Pass.ORE->emit([&]() {
662 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
663 << "Unable to find schedule";
664 });
665 return;
666 }
667
668 unsigned numStages = Schedule.getMaxStageCount();
669 // No need to generate pipeline if there are no overlapped iterations.
670 if (numStages == 0) {
671 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
672 NumFailZeroStage++;
673 Pass.ORE->emit([&]() {
675 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
676 << "No need to pipeline - no overlapped iterations in schedule.";
677 });
678 return;
679 }
680 // Check that the maximum stage count is less than user-defined limit.
681 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
682 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
683 << " : too many stages, abort\n");
684 NumFailLargeMaxStage++;
685 Pass.ORE->emit([&]() {
687 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
688 << "Too many stages in schedule: "
689 << ore::NV("numStages", (int)numStages) << " > "
690 << ore::NV("SwpMaxStages", SwpMaxStages)
691 << ". Refer to -pipeliner-max-stages.";
692 });
693 return;
694 }
695
696 Pass.ORE->emit([&]() {
698 Loop.getHeader())
699 << "Pipelined succesfully!";
700 });
701
702 // Generate the schedule as a ModuloSchedule.
703 DenseMap<MachineInstr *, int> Cycles, Stages;
704 std::vector<MachineInstr *> OrderedInsts;
705 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
706 ++Cycle) {
707 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
708 OrderedInsts.push_back(SU->getInstr());
709 Cycles[SU->getInstr()] = Cycle;
710 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
711 }
712 }
714 for (auto &KV : NewMIs) {
715 Cycles[KV.first] = Cycles[KV.second];
716 Stages[KV.first] = Stages[KV.second];
717 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
718 }
719
720 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
721 std::move(Stages));
723 assert(NewInstrChanges.empty() &&
724 "Cannot serialize a schedule with InstrChanges!");
726 MSTI.annotate();
727 return;
728 }
729 // The experimental code generator can't work if there are InstChanges.
730 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
731 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
732 MSE.expand();
733 } else if (MVECodeGen && NewInstrChanges.empty() &&
734 LoopPipelinerInfo->isMVEExpanderSupported() &&
736 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
737 MSE.expand();
738 } else {
739 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
740 MSE.expand();
741 MSE.cleanup();
742 }
743 ++NumPipelined;
744}
745
746/// Clean up after the software pipeliner runs.
748 for (auto &KV : NewMIs)
749 MF.deleteMachineInstr(KV.second);
750 NewMIs.clear();
751
752 // Call the superclass.
754}
755
756/// Return the register values for the operands of a Phi instruction.
757/// This function assume the instruction is a Phi.
759 unsigned &InitVal, unsigned &LoopVal) {
760 assert(Phi.isPHI() && "Expecting a Phi.");
761
762 InitVal = 0;
763 LoopVal = 0;
764 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
765 if (Phi.getOperand(i + 1).getMBB() != Loop)
766 InitVal = Phi.getOperand(i).getReg();
767 else
768 LoopVal = Phi.getOperand(i).getReg();
769
770 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
771}
772
773/// Return the Phi register value that comes the loop block.
774static unsigned getLoopPhiReg(const MachineInstr &Phi,
775 const MachineBasicBlock *LoopBB) {
776 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
777 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
778 return Phi.getOperand(i).getReg();
779 return 0;
780}
781
782/// Return true if SUb can be reached from SUa following the chain edges.
783static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
786 Worklist.push_back(SUa);
787 while (!Worklist.empty()) {
788 const SUnit *SU = Worklist.pop_back_val();
789 for (const auto &SI : SU->Succs) {
790 SUnit *SuccSU = SI.getSUnit();
791 if (SI.getKind() == SDep::Order) {
792 if (Visited.count(SuccSU))
793 continue;
794 if (SuccSU == SUb)
795 return true;
796 Worklist.push_back(SuccSU);
797 Visited.insert(SuccSU);
798 }
799 }
800 }
801 return false;
802}
803
804/// Return true if the instruction causes a chain between memory
805/// references before and after it.
807 return MI.isCall() || MI.mayRaiseFPException() ||
808 MI.hasUnmodeledSideEffects() ||
809 (MI.hasOrderedMemoryRef() &&
810 (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
811}
812
813/// Return the underlying objects for the memory references of an instruction.
814/// This function calls the code in ValueTracking, but first checks that the
815/// instruction has a memory operand.
818 if (!MI->hasOneMemOperand())
819 return;
820 MachineMemOperand *MM = *MI->memoperands_begin();
821 if (!MM->getValue())
822 return;
823 getUnderlyingObjects(MM->getValue(), Objs);
824 for (const Value *V : Objs) {
825 if (!isIdentifiedObject(V)) {
826 Objs.clear();
827 return;
828 }
829 }
830}
831
832/// Add a chain edge between a load and store if the store can be an
833/// alias of the load on a subsequent iteration, i.e., a loop carried
834/// dependence. This code is very similar to the code in ScheduleDAGInstrs
835/// but that code doesn't create loop carried dependences.
836void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
840 for (auto &SU : SUnits) {
841 MachineInstr &MI = *SU.getInstr();
843 PendingLoads.clear();
844 else if (MI.mayLoad()) {
847 if (Objs.empty())
849 for (const auto *V : Objs) {
850 SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
851 SUs.push_back(&SU);
852 }
853 } else if (MI.mayStore()) {
856 if (Objs.empty())
858 for (const auto *V : Objs) {
860 PendingLoads.find(V);
861 if (I == PendingLoads.end())
862 continue;
863 for (auto *Load : I->second) {
864 if (isSuccOrder(Load, &SU))
865 continue;
866 MachineInstr &LdMI = *Load->getInstr();
867 // First, perform the cheaper check that compares the base register.
868 // If they are the same and the load offset is less than the store
869 // offset, then mark the dependence as loop carried potentially.
870 const MachineOperand *BaseOp1, *BaseOp2;
871 int64_t Offset1, Offset2;
872 bool Offset1IsScalable, Offset2IsScalable;
873 if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
874 Offset1IsScalable, TRI) &&
875 TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
876 Offset2IsScalable, TRI)) {
877 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
878 Offset1IsScalable == Offset2IsScalable &&
879 (int)Offset1 < (int)Offset2) {
881 "What happened to the chain edge?");
882 SDep Dep(Load, SDep::Barrier);
883 Dep.setLatency(1);
884 SU.addPred(Dep);
885 continue;
886 }
887 }
888 // Second, the more expensive check that uses alias analysis on the
889 // base registers. If they alias, and the load offset is less than
890 // the store offset, the mark the dependence as loop carried.
891 if (!AA) {
892 SDep Dep(Load, SDep::Barrier);
893 Dep.setLatency(1);
894 SU.addPred(Dep);
895 continue;
896 }
897 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
898 MachineMemOperand *MMO2 = *MI.memoperands_begin();
899 if (!MMO1->getValue() || !MMO2->getValue()) {
900 SDep Dep(Load, SDep::Barrier);
901 Dep.setLatency(1);
902 SU.addPred(Dep);
903 continue;
904 }
905 if (MMO1->getValue() == MMO2->getValue() &&
906 MMO1->getOffset() <= MMO2->getOffset()) {
907 SDep Dep(Load, SDep::Barrier);
908 Dep.setLatency(1);
909 SU.addPred(Dep);
910 continue;
911 }
912 if (!AA->isNoAlias(
915 MMO2->getAAInfo()))) {
916 SDep Dep(Load, SDep::Barrier);
917 Dep.setLatency(1);
918 SU.addPred(Dep);
919 }
920 }
921 }
922 }
923 }
924}
925
926/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
927/// processes dependences for PHIs. This function adds true dependences
928/// from a PHI to a use, and a loop carried dependence from the use to the
929/// PHI. The loop carried dependence is represented as an anti dependence
930/// edge. This function also removes chain dependences between unrelated
931/// PHIs.
932void SwingSchedulerDAG::updatePhiDependences() {
933 SmallVector<SDep, 4> RemoveDeps;
935
936 // Iterate over each DAG node.
937 for (SUnit &I : SUnits) {
938 RemoveDeps.clear();
939 // Set to true if the instruction has an operand defined by a Phi.
940 unsigned HasPhiUse = 0;
941 unsigned HasPhiDef = 0;
942 MachineInstr *MI = I.getInstr();
943 // Iterate over each operand, and we process the definitions.
944 for (const MachineOperand &MO : MI->operands()) {
945 if (!MO.isReg())
946 continue;
947 Register Reg = MO.getReg();
948 if (MO.isDef()) {
949 // If the register is used by a Phi, then create an anti dependence.
951 UI = MRI.use_instr_begin(Reg),
952 UE = MRI.use_instr_end();
953 UI != UE; ++UI) {
954 MachineInstr *UseMI = &*UI;
955 SUnit *SU = getSUnit(UseMI);
956 if (SU != nullptr && UseMI->isPHI()) {
957 if (!MI->isPHI()) {
958 SDep Dep(SU, SDep::Anti, Reg);
959 Dep.setLatency(1);
960 I.addPred(Dep);
961 } else {
962 HasPhiDef = Reg;
963 // Add a chain edge to a dependent Phi that isn't an existing
964 // predecessor.
965 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
966 I.addPred(SDep(SU, SDep::Barrier));
967 }
968 }
969 }
970 } else if (MO.isUse()) {
971 // If the register is defined by a Phi, then create a true dependence.
973 if (DefMI == nullptr)
974 continue;
975 SUnit *SU = getSUnit(DefMI);
976 if (SU != nullptr && DefMI->isPHI()) {
977 if (!MI->isPHI()) {
978 SDep Dep(SU, SDep::Data, Reg);
979 Dep.setLatency(0);
980 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
981 &SchedModel);
982 I.addPred(Dep);
983 } else {
984 HasPhiUse = Reg;
985 // Add a chain edge to a dependent Phi that isn't an existing
986 // predecessor.
987 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
988 I.addPred(SDep(SU, SDep::Barrier));
989 }
990 }
991 }
992 }
993 // Remove order dependences from an unrelated Phi.
994 if (!SwpPruneDeps)
995 continue;
996 for (auto &PI : I.Preds) {
997 MachineInstr *PMI = PI.getSUnit()->getInstr();
998 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
999 if (I.getInstr()->isPHI()) {
1000 if (PMI->getOperand(0).getReg() == HasPhiUse)
1001 continue;
1002 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1003 continue;
1004 }
1005 RemoveDeps.push_back(PI);
1006 }
1007 }
1008 for (const SDep &D : RemoveDeps)
1009 I.removePred(D);
1010 }
1011}
1012
1013/// Iterate over each DAG node and see if we can change any dependences
1014/// in order to reduce the recurrence MII.
1015void SwingSchedulerDAG::changeDependences() {
1016 // See if an instruction can use a value from the previous iteration.
1017 // If so, we update the base and offset of the instruction and change
1018 // the dependences.
1019 for (SUnit &I : SUnits) {
1020 unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1021 int64_t NewOffset = 0;
1022 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1023 NewOffset))
1024 continue;
1025
1026 // Get the MI and SUnit for the instruction that defines the original base.
1027 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1029 if (!DefMI)
1030 continue;
1031 SUnit *DefSU = getSUnit(DefMI);
1032 if (!DefSU)
1033 continue;
1034 // Get the MI and SUnit for the instruction that defins the new base.
1035 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1036 if (!LastMI)
1037 continue;
1038 SUnit *LastSU = getSUnit(LastMI);
1039 if (!LastSU)
1040 continue;
1041
1042 if (Topo.IsReachable(&I, LastSU))
1043 continue;
1044
1045 // Remove the dependence. The value now depends on a prior iteration.
1047 for (const SDep &P : I.Preds)
1048 if (P.getSUnit() == DefSU)
1049 Deps.push_back(P);
1050 for (const SDep &D : Deps) {
1051 Topo.RemovePred(&I, D.getSUnit());
1052 I.removePred(D);
1053 }
1054 // Remove the chain dependence between the instructions.
1055 Deps.clear();
1056 for (auto &P : LastSU->Preds)
1057 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1058 Deps.push_back(P);
1059 for (const SDep &D : Deps) {
1060 Topo.RemovePred(LastSU, D.getSUnit());
1061 LastSU->removePred(D);
1062 }
1063
1064 // Add a dependence between the new instruction and the instruction
1065 // that defines the new base.
1066 SDep Dep(&I, SDep::Anti, NewBase);
1067 Topo.AddPred(LastSU, &I);
1068 LastSU->addPred(Dep);
1069
1070 // Remember the base and offset information so that we can update the
1071 // instruction during code generation.
1072 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1073 }
1074}
1075
1076/// Create an instruction stream that represents a single iteration and stage of
1077/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1078/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1079/// function is an approximation of SMSchedule::finalizeSchedule with all
1080/// non-const operations removed.
1082 SMSchedule &Schedule,
1083 std::vector<MachineInstr *> &OrderedInsts,
1086
1087 // Move all instructions to the first stage from the later stages.
1088 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1089 ++Cycle) {
1090 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1091 Stage <= LastStage; ++Stage) {
1092 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1093 Cycle + Stage * Schedule.getInitiationInterval()))) {
1094 Instrs[Cycle].push_front(SU);
1095 }
1096 }
1097 }
1098
1099 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1100 ++Cycle) {
1101 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1102 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1103 for (SUnit *SU : CycleInstrs) {
1104 MachineInstr *MI = SU->getInstr();
1105 OrderedInsts.push_back(MI);
1106 Stages[MI] = Schedule.stageScheduled(SU);
1107 }
1108 }
1109}
1110
1111namespace {
1112
1113// FuncUnitSorter - Comparison operator used to sort instructions by
1114// the number of functional unit choices.
1115struct FuncUnitSorter {
1116 const InstrItineraryData *InstrItins;
1117 const MCSubtargetInfo *STI;
1119
1120 FuncUnitSorter(const TargetSubtargetInfo &TSI)
1121 : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
1122
1123 // Compute the number of functional unit alternatives needed
1124 // at each stage, and take the minimum value. We prioritize the
1125 // instructions by the least number of choices first.
1126 unsigned minFuncUnits(const MachineInstr *Inst,
1127 InstrStage::FuncUnits &F) const {
1128 unsigned SchedClass = Inst->getDesc().getSchedClass();
1129 unsigned min = UINT_MAX;
1130 if (InstrItins && !InstrItins->isEmpty()) {
1131 for (const InstrStage &IS :
1132 make_range(InstrItins->beginStage(SchedClass),
1133 InstrItins->endStage(SchedClass))) {
1134 InstrStage::FuncUnits funcUnits = IS.getUnits();
1135 unsigned numAlternatives = llvm::popcount(funcUnits);
1136 if (numAlternatives < min) {
1137 min = numAlternatives;
1138 F = funcUnits;
1139 }
1140 }
1141 return min;
1142 }
1143 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1144 const MCSchedClassDesc *SCDesc =
1145 STI->getSchedModel().getSchedClassDesc(SchedClass);
1146 if (!SCDesc->isValid())
1147 // No valid Schedule Class Desc for schedClass, should be
1148 // Pseudo/PostRAPseudo
1149 return min;
1150
1151 for (const MCWriteProcResEntry &PRE :
1152 make_range(STI->getWriteProcResBegin(SCDesc),
1153 STI->getWriteProcResEnd(SCDesc))) {
1154 if (!PRE.ReleaseAtCycle)
1155 continue;
1156 const MCProcResourceDesc *ProcResource =
1157 STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
1158 unsigned NumUnits = ProcResource->NumUnits;
1159 if (NumUnits < min) {
1160 min = NumUnits;
1161 F = PRE.ProcResourceIdx;
1162 }
1163 }
1164 return min;
1165 }
1166 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1167 }
1168
1169 // Compute the critical resources needed by the instruction. This
1170 // function records the functional units needed by instructions that
1171 // must use only one functional unit. We use this as a tie breaker
1172 // for computing the resource MII. The instrutions that require
1173 // the same, highly used, functional unit have high priority.
1174 void calcCriticalResources(MachineInstr &MI) {
1175 unsigned SchedClass = MI.getDesc().getSchedClass();
1176 if (InstrItins && !InstrItins->isEmpty()) {
1177 for (const InstrStage &IS :
1178 make_range(InstrItins->beginStage(SchedClass),
1179 InstrItins->endStage(SchedClass))) {
1180 InstrStage::FuncUnits FuncUnits = IS.getUnits();
1181 if (llvm::popcount(FuncUnits) == 1)
1182 Resources[FuncUnits]++;
1183 }
1184 return;
1185 }
1186 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1187 const MCSchedClassDesc *SCDesc =
1188 STI->getSchedModel().getSchedClassDesc(SchedClass);
1189 if (!SCDesc->isValid())
1190 // No valid Schedule Class Desc for schedClass, should be
1191 // Pseudo/PostRAPseudo
1192 return;
1193
1194 for (const MCWriteProcResEntry &PRE :
1195 make_range(STI->getWriteProcResBegin(SCDesc),
1196 STI->getWriteProcResEnd(SCDesc))) {
1197 if (!PRE.ReleaseAtCycle)
1198 continue;
1199 Resources[PRE.ProcResourceIdx]++;
1200 }
1201 return;
1202 }
1203 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1204 }
1205
1206 /// Return true if IS1 has less priority than IS2.
1207 bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1208 InstrStage::FuncUnits F1 = 0, F2 = 0;
1209 unsigned MFUs1 = minFuncUnits(IS1, F1);
1210 unsigned MFUs2 = minFuncUnits(IS2, F2);
1211 if (MFUs1 == MFUs2)
1212 return Resources.lookup(F1) < Resources.lookup(F2);
1213 return MFUs1 > MFUs2;
1214 }
1215};
1216
1217/// Calculate the maximum register pressure of the scheduled instructions stream
1218class HighRegisterPressureDetector {
1219 MachineBasicBlock *OrigMBB;
1220 const MachineRegisterInfo &MRI;
1221 const TargetRegisterInfo *TRI;
1222
1223 const unsigned PSetNum;
1224
1225 // Indexed by PSet ID
1226 // InitSetPressure takes into account the register pressure of live-in
1227 // registers. It's not depend on how the loop is scheduled, so it's enough to
1228 // calculate them once at the beginning.
1229 std::vector<unsigned> InitSetPressure;
1230
1231 // Indexed by PSet ID
1232 // Upper limit for each register pressure set
1233 std::vector<unsigned> PressureSetLimit;
1234
1236
1238
1239public:
1240 using OrderedInstsTy = std::vector<MachineInstr *>;
1241 using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1242
1243private:
1244 static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1245 if (Pressures.size() == 0) {
1246 dbgs() << "[]";
1247 } else {
1248 char Prefix = '[';
1249 for (unsigned P : Pressures) {
1250 dbgs() << Prefix << P;
1251 Prefix = ' ';
1252 }
1253 dbgs() << ']';
1254 }
1255 }
1256
1257 void dumpPSet(Register Reg) const {
1258 dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1259 for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
1260 ++PSetIter) {
1261 dbgs() << *PSetIter << ' ';
1262 }
1263 dbgs() << '\n';
1264 }
1265
1266 void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1267 Register Reg) const {
1268 auto PSetIter = MRI.getPressureSets(Reg);
1269 unsigned Weight = PSetIter.getWeight();
1270 for (; PSetIter.isValid(); ++PSetIter)
1271 Pressure[*PSetIter] += Weight;
1272 }
1273
1274 void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1275 Register Reg) const {
1276 auto PSetIter = MRI.getPressureSets(Reg);
1277 unsigned Weight = PSetIter.getWeight();
1278 for (; PSetIter.isValid(); ++PSetIter) {
1279 auto &P = Pressure[*PSetIter];
1280 assert(P >= Weight &&
1281 "register pressure must be greater than or equal weight");
1282 P -= Weight;
1283 }
1284 }
1285
1286 // Return true if Reg is reserved one, for example, stack pointer
1287 bool isReservedRegister(Register Reg) const {
1288 return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
1289 }
1290
1291 bool isDefinedInThisLoop(Register Reg) const {
1292 return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1293 }
1294
1295 // Search for live-in variables. They are factored into the register pressure
1296 // from the begining. Live-in variables used by every iteration should be
1297 // considered as alive throughout the loop. For example, the variable `c` in
1298 // following code. \code
1299 // int c = ...;
1300 // for (int i = 0; i < n; i++)
1301 // a[i] += b[i] + c;
1302 // \endcode
1303 void computeLiveIn() {
1305 for (auto &MI : *OrigMBB) {
1306 if (MI.isDebugInstr())
1307 continue;
1308 for (auto &Use : ROMap[&MI].Uses) {
1309 auto Reg = Use.RegUnit;
1310 // Ignore the variable that appears only on one side of phi instruction
1311 // because it's used only at the first iteration.
1312 if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1313 continue;
1314 if (isReservedRegister(Reg))
1315 continue;
1316 if (isDefinedInThisLoop(Reg))
1317 continue;
1318 Used.insert(Reg);
1319 }
1320 }
1321
1322 for (auto LiveIn : Used)
1323 increaseRegisterPressure(InitSetPressure, LiveIn);
1324 }
1325
1326 // Calculate the upper limit of each pressure set
1327 void computePressureSetLimit(const RegisterClassInfo &RCI) {
1328 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1329 PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
1330 }
1331
1332 // There are two patterns of last-use.
1333 // - by an instruction of the current iteration
1334 // - by a phi instruction of the next iteration (loop carried value)
1335 //
1336 // Furthermore, following two groups of instructions are executed
1337 // simultaneously
1338 // - next iteration's phi instructions in i-th stage
1339 // - current iteration's instructions in i+1-th stage
1340 //
1341 // This function calculates the last-use of each register while taking into
1342 // account the above two patterns.
1343 Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1344 Instr2StageTy &Stages) const {
1345 // We treat virtual registers that are defined and used in this loop.
1346 // Following virtual register will be ignored
1347 // - live-in one
1348 // - defined but not used in the loop (potentially live-out)
1349 DenseSet<Register> TargetRegs;
1350 const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1351 if (isDefinedInThisLoop(Reg))
1352 TargetRegs.insert(Reg);
1353 };
1354 for (MachineInstr *MI : OrderedInsts) {
1355 if (MI->isPHI()) {
1356 Register Reg = getLoopPhiReg(*MI, OrigMBB);
1357 UpdateTargetRegs(Reg);
1358 } else {
1359 for (auto &Use : ROMap.find(MI)->getSecond().Uses)
1360 UpdateTargetRegs(Use.RegUnit);
1361 }
1362 }
1363
1364 const auto InstrScore = [&Stages](MachineInstr *MI) {
1365 return Stages[MI] + MI->isPHI();
1366 };
1367
1369 for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1370 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1371 auto Reg = Use.RegUnit;
1372 if (!TargetRegs.contains(Reg))
1373 continue;
1374 auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
1375 if (!Inserted) {
1376 MachineInstr *Orig = Ite->second;
1377 MachineInstr *New = MI;
1378 if (InstrScore(Orig) < InstrScore(New))
1379 Ite->second = New;
1380 }
1381 }
1382 }
1383
1384 Instr2LastUsesTy LastUses;
1385 for (auto &Entry : LastUseMI)
1386 LastUses[Entry.second].insert(Entry.first);
1387 return LastUses;
1388 }
1389
1390 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1391 // iterations and check the register pressure at the point where all stages
1392 // overlapping.
1393 //
1394 // An example of unrolled loop where #Stage is 4..
1395 // Iter i+0 i+1 i+2 i+3
1396 // ------------------------
1397 // Stage 0
1398 // Stage 1 0
1399 // Stage 2 1 0
1400 // Stage 3 2 1 0 <- All stages overlap
1401 //
1402 std::vector<unsigned>
1403 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1404 Instr2StageTy &Stages,
1405 const unsigned StageCount) const {
1406 using RegSetTy = SmallDenseSet<Register, 16>;
1407
1408 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1409 // manage the liveness of the registers independently by iterations.
1410 SmallVector<RegSetTy> LiveRegSets(StageCount);
1411
1412 auto CurSetPressure = InitSetPressure;
1413 auto MaxSetPressure = InitSetPressure;
1414 auto LastUses = computeLastUses(OrderedInsts, Stages);
1415
1416 LLVM_DEBUG({
1417 dbgs() << "Ordered instructions:\n";
1418 for (MachineInstr *MI : OrderedInsts) {
1419 dbgs() << "Stage " << Stages[MI] << ": ";
1420 MI->dump();
1421 }
1422 });
1423
1424 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1425 Register Reg) {
1426 if (!Reg.isValid() || isReservedRegister(Reg))
1427 return;
1428
1429 bool Inserted = RegSet.insert(Reg).second;
1430 if (!Inserted)
1431 return;
1432
1433 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1434 increaseRegisterPressure(CurSetPressure, Reg);
1435 LLVM_DEBUG(dumpPSet(Reg));
1436 };
1437
1438 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1439 Register Reg) {
1440 if (!Reg.isValid() || isReservedRegister(Reg))
1441 return;
1442
1443 // live-in register
1444 if (!RegSet.contains(Reg))
1445 return;
1446
1447 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1448 RegSet.erase(Reg);
1449 decreaseRegisterPressure(CurSetPressure, Reg);
1450 LLVM_DEBUG(dumpPSet(Reg));
1451 };
1452
1453 for (unsigned I = 0; I < StageCount; I++) {
1454 for (MachineInstr *MI : OrderedInsts) {
1455 const auto Stage = Stages[MI];
1456 if (I < Stage)
1457 continue;
1458
1459 const unsigned Iter = I - Stage;
1460
1461 for (auto &Def : ROMap.find(MI)->getSecond().Defs)
1462 InsertReg(LiveRegSets[Iter], Def.RegUnit);
1463
1464 for (auto LastUse : LastUses[MI]) {
1465 if (MI->isPHI()) {
1466 if (Iter != 0)
1467 EraseReg(LiveRegSets[Iter - 1], LastUse);
1468 } else {
1469 EraseReg(LiveRegSets[Iter], LastUse);
1470 }
1471 }
1472
1473 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1474 MaxSetPressure[PSet] =
1475 std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1476
1477 LLVM_DEBUG({
1478 dbgs() << "CurSetPressure=";
1479 dumpRegisterPressures(CurSetPressure);
1480 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1481 MI->dump();
1482 });
1483 }
1484 }
1485
1486 return MaxSetPressure;
1487 }
1488
1489public:
1490 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1491 const MachineFunction &MF)
1492 : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
1493 TRI(MF.getSubtarget().getRegisterInfo()),
1494 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1495 PressureSetLimit(PSetNum, 0) {}
1496
1497 // Used to calculate register pressure, which is independent of loop
1498 // scheduling.
1499 void init(const RegisterClassInfo &RCI) {
1500 for (MachineInstr &MI : *OrigMBB) {
1501 if (MI.isDebugInstr())
1502 continue;
1503 ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1504 }
1505
1506 computeLiveIn();
1507 computePressureSetLimit(RCI);
1508 }
1509
1510 // Calculate the maximum register pressures of the loop and check if they
1511 // exceed the limit
1512 bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1513 const unsigned MaxStage) const {
1515 "the percentage of the margin must be between 0 to 100");
1516
1517 OrderedInstsTy OrderedInsts;
1518 Instr2StageTy Stages;
1519 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1520 const auto MaxSetPressure =
1521 computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1522
1523 LLVM_DEBUG({
1524 dbgs() << "Dump MaxSetPressure:\n";
1525 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1526 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1527 }
1528 dbgs() << '\n';
1529 });
1530
1531 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1532 unsigned Limit = PressureSetLimit[PSet];
1533 unsigned Margin = Limit * RegPressureMargin / 100;
1534 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1535 << " Margin=" << Margin << "\n");
1536 if (Limit < MaxSetPressure[PSet] + Margin) {
1537 LLVM_DEBUG(
1538 dbgs()
1539 << "Rejected the schedule because of too high register pressure\n");
1540 return true;
1541 }
1542 }
1543 return false;
1544 }
1545};
1546
1547} // end anonymous namespace
1548
1549/// Calculate the resource constrained minimum initiation interval for the
1550/// specified loop. We use the DFA to model the resources needed for
1551/// each instruction, and we ignore dependences. A different DFA is created
1552/// for each cycle that is required. When adding a new instruction, we attempt
1553/// to add it to each existing DFA, until a legal space is found. If the
1554/// instruction cannot be reserved in an existing DFA, we create a new one.
1555unsigned SwingSchedulerDAG::calculateResMII() {
1556 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1558 return RM.calculateResMII();
1559}
1560
1561/// Calculate the recurrence-constrainted minimum initiation interval.
1562/// Iterate over each circuit. Compute the delay(c) and distance(c)
1563/// for each circuit. The II needs to satisfy the inequality
1564/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1565/// II that satisfies the inequality, and the RecMII is the maximum
1566/// of those values.
1567unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1568 unsigned RecMII = 0;
1569
1570 for (NodeSet &Nodes : NodeSets) {
1571 if (Nodes.empty())
1572 continue;
1573
1574 unsigned Delay = Nodes.getLatency();
1575 unsigned Distance = 1;
1576
1577 // ii = ceil(delay / distance)
1578 unsigned CurMII = (Delay + Distance - 1) / Distance;
1579 Nodes.setRecMII(CurMII);
1580 if (CurMII > RecMII)
1581 RecMII = CurMII;
1582 }
1583
1584 return RecMII;
1585}
1586
/// Create the adjacency structure of the nodes in the graph.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDAG *DAG) {
  BitVector Added(SUnits.size());
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    // Added is reset per node so each adjacency list is duplicate-free.
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (OE.isOutputDep()) {
        int N = OE.getDst()->NodeNum;
        int BackEdge = i;
        // If i is already the tail of a recorded chain, extend the chain:
        // the new tail N maps back to the chain's original head.
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
        continue;

      // This code is retained to preserve previous behavior and prevent
      // regression. This condition means that anti-dependencies within an
      // iteration are ignored when searching circuits. Therefore it's natural
      // to consider this dependence as well.
      // FIXME: Remove this code if it doesn't have significant impact on
      // performance.
      if (OE.isAntiDep())
        continue;

      int N = OE.getDst()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }
    // A chain edge between a store and a load is treated as a back-edge in the
    // adjacency matrix.
    for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
      SUnit *Src = IE.getSrc();
      SUnit *Dst = IE.getDst();
      if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
        continue;
      if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
        int N = Src->NodeNum;
        if (!Added.test(N)) {
          AdjK[i].push_back(N);
          Added.set(N);
        }
      }
    }
  }
  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
    }
}
1650
1651/// Identify an elementary circuit in the dependence graph starting at the
1652/// specified node.
1653bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1654 const SwingSchedulerDAG *DAG,
1655 bool HasBackedge) {
1656 SUnit *SV = &SUnits[V];
1657 bool F = false;
1658 Stack.insert(SV);
1659 Blocked.set(V);
1660
1661 for (auto W : AdjK[V]) {
1662 if (NumPaths > MaxPaths)
1663 break;
1664 if (W < S)
1665 continue;
1666 if (W == S) {
1667 if (!HasBackedge)
1668 NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
1669 F = true;
1670 ++NumPaths;
1671 break;
1672 }
1673 if (!Blocked.test(W)) {
1674 if (circuit(W, S, NodeSets, DAG,
1675 Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1676 F = true;
1677 }
1678 }
1679
1680 if (F)
1681 unblock(V);
1682 else {
1683 for (auto W : AdjK[V]) {
1684 if (W < S)
1685 continue;
1686 B[W].insert(SV);
1687 }
1688 }
1689 Stack.pop_back();
1690 return F;
1691}
1692
1693/// Unblock a node in the circuit finding algorithm.
1694void SwingSchedulerDAG::Circuits::unblock(int U) {
1695 Blocked.reset(U);
1697 while (!BU.empty()) {
1699 assert(SI != BU.end() && "Invalid B set.");
1700 SUnit *W = *SI;
1701 BU.erase(W);
1702 if (Blocked.test(W->NodeNum))
1703 unblock(W->NodeNum);
1704 }
1705}
1706
1707/// Identify all the elementary circuits in the dependence graph using
1708/// Johnson's circuit algorithm.
1709void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1710 Circuits Cir(SUnits, Topo);
1711 // Create the adjacency structure.
1712 Cir.createAdjacencyStructure(this);
1713 for (int I = 0, E = SUnits.size(); I != E; ++I) {
1714 Cir.reset();
1715 Cir.circuit(I, I, NodeSets, this);
1716 }
1717}
1718
1719// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1720// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1721// additional copies that are needed across iterations. An artificial dependence
1722// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1723
1724// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
// SRCOfCopy------True-Dep---> COPY/REG_SEQUENCE
1726// PHI-------True-Dep------> USEOfPhi
1727
1728// The mutation creates
1729// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1730
1731// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
// (since USE is a predecessor), which implies the COPY/REG_SEQUENCE is scheduled
1733// late to avoid additional copies across iterations. The possible scheduling
1734// order would be
1735// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1736
1737void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1738 for (SUnit &SU : DAG->SUnits) {
1739 // Find the COPY/REG_SEQUENCE instruction.
1740 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1741 continue;
1742
1743 // Record the loop carried PHIs.
1745 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1747
1748 for (auto &Dep : SU.Preds) {
1749 SUnit *TmpSU = Dep.getSUnit();
1750 MachineInstr *TmpMI = TmpSU->getInstr();
1751 SDep::Kind DepKind = Dep.getKind();
1752 // Save the loop carried PHI.
1753 if (DepKind == SDep::Anti && TmpMI->isPHI())
1754 PHISUs.push_back(TmpSU);
1755 // Save the source of COPY/REG_SEQUENCE.
1756 // If the source has no pre-decessors, we will end up creating cycles.
1757 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1758 SrcSUs.push_back(TmpSU);
1759 }
1760
1761 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1762 continue;
1763
1764 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1765 // SUnit to the container.
1767 // Do not use iterator based loop here as we are updating the container.
1768 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1769 for (auto &Dep : PHISUs[Index]->Succs) {
1770 if (Dep.getKind() != SDep::Data)
1771 continue;
1772
1773 SUnit *TmpSU = Dep.getSUnit();
1774 MachineInstr *TmpMI = TmpSU->getInstr();
1775 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1776 PHISUs.push_back(TmpSU);
1777 continue;
1778 }
1779 UseSUs.push_back(TmpSU);
1780 }
1781 }
1782
1783 if (UseSUs.size() == 0)
1784 continue;
1785
1786 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1787 // Add the artificial dependencies if it does not form a cycle.
1788 for (auto *I : UseSUs) {
1789 for (auto *Src : SrcSUs) {
1790 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1791 Src->addPred(SDep(I, SDep::Artificial));
1792 SDAG->Topo.AddPred(Src, I);
1793 }
1794 }
1795 }
1796 }
1797}
1798
/// Compute several functions needed to order the nodes for scheduling.
///  ASAP - Earliest time to schedule a node.
///  ALAP - Latest time to schedule a node.
///  MOV - Mobility function, difference between ALAP and ASAP.
///  D - Depth of each node.
///  H - Height of each node.
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
  ScheduleInfo.resize(SUnits.size());

  LLVM_DEBUG({
    for (int I : Topo) {
      const SUnit &SU = SUnits[I];
      dumpNode(SU);
    }
  });

  int maxASAP = 0;
  // Compute ASAP and ZeroLatencyDepth.
  // Walking in topological order guarantees each predecessor's values are
  // final before they are read here.
  for (int I : Topo) {
    int asap = 0;
    int zeroLatencyDepth = 0;
    SUnit *SU = &SUnits[I];
    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *Pred = IE.getSrc();
      if (IE.getLatency() == 0)
        zeroLatencyDepth =
            std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
      if (IE.ignoreDependence(true))
        continue;
      // A loop-carried edge (distance > 0) relaxes the bound by one II per
      // iteration of distance.
      asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
                                  IE.getDistance() * MII));
    }
    maxASAP = std::max(maxASAP, asap);
    ScheduleInfo[I].ASAP = asap;
    ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
  }

  // Compute ALAP, ZeroLatencyHeight, and MOV.
  // Reverse topological order: each successor's ALAP is final before use.
  for (int I : llvm::reverse(Topo)) {
    int alap = maxASAP;
    int zeroLatencyHeight = 0;
    SUnit *SU = &SUnits[I];
    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *Succ = OE.getDst();
      if (Succ->isBoundaryNode())
        continue;
      if (OE.getLatency() == 0)
        zeroLatencyHeight =
            std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
      if (OE.ignoreDependence(true))
        continue;
      alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
                                  OE.getDistance() * MII));
    }

    ScheduleInfo[I].ALAP = alap;
    ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
  }

  // After computing the node functions, compute the summary for each node set.
  for (NodeSet &I : NodeSets)
    I.computeNodeSetInfo(this);

  LLVM_DEBUG({
    for (unsigned i = 0; i < SUnits.size(); i++) {
      dbgs() << "\tNode " << i << ":\n";
      dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
      dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
      dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
      dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
      dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
      dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
      dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
    }
  });
}
1875
1876/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1877/// as the predecessors of the elements of NodeOrder that are not also in
1878/// NodeOrder.
1881 const NodeSet *S = nullptr) {
1882 Preds.clear();
1883
1884 for (SUnit *SU : NodeOrder) {
1885 for (const auto &IE : DDG->getInEdges(SU)) {
1886 SUnit *PredSU = IE.getSrc();
1887 if (S && S->count(PredSU) == 0)
1888 continue;
1889 if (IE.ignoreDependence(true))
1890 continue;
1891 if (NodeOrder.count(PredSU) == 0)
1892 Preds.insert(PredSU);
1893 }
1894
1895 // FIXME: The following loop-carried dependencies may also need to be
1896 // considered.
1897 // - Physical register dependencies (true-dependence and WAW).
1898 // - Memory dependencies.
1899 for (const auto &OE : DDG->getOutEdges(SU)) {
1900 SUnit *SuccSU = OE.getDst();
1901 if (!OE.isAntiDep())
1902 continue;
1903 if (S && S->count(SuccSU) == 0)
1904 continue;
1905 if (NodeOrder.count(SuccSU) == 0)
1906 Preds.insert(SuccSU);
1907 }
1908 }
1909 return !Preds.empty();
1910}
1911
1912/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1913/// as the successors of the elements of NodeOrder that are not also in
1914/// NodeOrder.
1917 const NodeSet *S = nullptr) {
1918 Succs.clear();
1919
1920 for (SUnit *SU : NodeOrder) {
1921 for (const auto &OE : DDG->getOutEdges(SU)) {
1922 SUnit *SuccSU = OE.getDst();
1923 if (S && S->count(SuccSU) == 0)
1924 continue;
1925 if (OE.ignoreDependence(false))
1926 continue;
1927 if (NodeOrder.count(SuccSU) == 0)
1928 Succs.insert(SuccSU);
1929 }
1930
1931 // FIXME: The following loop-carried dependencies may also need to be
1932 // considered.
1933 // - Physical register dependnecies (true-dependnece and WAW).
1934 // - Memory dependencies.
1935 for (const auto &IE : DDG->getInEdges(SU)) {
1936 SUnit *PredSU = IE.getSrc();
1937 if (!IE.isAntiDep())
1938 continue;
1939 if (S && S->count(PredSU) == 0)
1940 continue;
1941 if (NodeOrder.count(PredSU) == 0)
1942 Succs.insert(PredSU);
1943 }
1944 }
1945 return !Succs.empty();
1946}
1947
/// Return true if there is a path from the specified node to any of the nodes
/// in DestNodes. Keep track and return the nodes in any path.
static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
                        SetVector<SUnit *> &DestNodes,
                        SetVector<SUnit *> &Exclude,
                        SmallPtrSet<SUnit *, 8> &Visited,
                        SwingSchedulerDDG *DDG) {
  if (Cur->isBoundaryNode())
    return false;
  if (Exclude.contains(Cur))
    return false;
  if (DestNodes.contains(Cur))
    return true;
  // If Cur was already explored, the answer is whether that exploration
  // placed it on a path; this also prevents infinite recursion on cycles.
  if (!Visited.insert(Cur).second)
    return Path.contains(Cur);
  bool FoundPath = false;
  // Follow forward edges (except ignorable ones) ...
  for (const auto &OE : DDG->getOutEdges(Cur))
    if (!OE.ignoreDependence(false))
      FoundPath |=
          computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
  // ... and same-iteration anti-dependence sources, which are traversed in
  // the reverse direction.
  for (const auto &IE : DDG->getInEdges(Cur))
    if (IE.isAntiDep() && IE.getDistance() == 0)
      FoundPath |=
          computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
  if (FoundPath)
    Path.insert(Cur);
  return FoundPath;
}
1976
1977/// Compute the live-out registers for the instructions in a node-set.
1978/// The live-out registers are those that are defined in the node-set,
1979/// but not used. Except for use operands of Phis.
1981 NodeSet &NS) {
1986 for (SUnit *SU : NS) {
1987 const MachineInstr *MI = SU->getInstr();
1988 if (MI->isPHI())
1989 continue;
1990 for (const MachineOperand &MO : MI->all_uses()) {
1991 Register Reg = MO.getReg();
1992 if (Reg.isVirtual())
1993 Uses.insert(Reg);
1994 else if (MRI.isAllocatable(Reg))
1995 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
1996 Uses.insert(Unit);
1997 }
1998 }
1999 for (SUnit *SU : NS)
2000 for (const MachineOperand &MO : SU->getInstr()->all_defs())
2001 if (!MO.isDead()) {
2002 Register Reg = MO.getReg();
2003 if (Reg.isVirtual()) {
2004 if (!Uses.count(Reg))
2005 LiveOutRegs.emplace_back(Reg, LaneBitmask::getNone());
2006 } else if (MRI.isAllocatable(Reg)) {
2007 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2008 if (!Uses.count(Unit))
2009 LiveOutRegs.emplace_back(Unit, LaneBitmask::getNone());
2010 }
2011 }
2012 RPTracker.addLiveRegs(LiveOutRegs);
2013}
2014
2015/// A heuristic to filter nodes in recurrent node-sets if the register
2016/// pressure of a set is too high.
2017void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2018 for (auto &NS : NodeSets) {
2019 // Skip small node-sets since they won't cause register pressure problems.
2020 if (NS.size() <= 2)
2021 continue;
2022 IntervalPressure RecRegPressure;
2023 RegPressureTracker RecRPTracker(RecRegPressure);
2024 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2025 computeLiveOuts(MF, RecRPTracker, NS);
2026 RecRPTracker.closeBottom();
2027
2028 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2029 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2030 return A->NodeNum > B->NodeNum;
2031 });
2032
2033 for (auto &SU : SUnits) {
2034 // Since we're computing the register pressure for a subset of the
2035 // instructions in a block, we need to set the tracker for each
2036 // instruction in the node-set. The tracker is set to the instruction
2037 // just after the one we're interested in.
2039 RecRPTracker.setPos(std::next(CurInstI));
2040
2041 RegPressureDelta RPDelta;
2042 ArrayRef<PressureChange> CriticalPSets;
2043 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2044 CriticalPSets,
2045 RecRegPressure.MaxSetPressure);
2046 if (RPDelta.Excess.isValid()) {
2047 LLVM_DEBUG(
2048 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2050 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2051 NS.setExceedPressure(SU);
2052 break;
2053 }
2054 RecRPTracker.recede();
2055 }
2056 }
2057}
2058
2059/// A heuristic to colocate node sets that have the same set of
2060/// successors.
2061void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2062 unsigned Colocate = 0;
2063 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2064 NodeSet &N1 = NodeSets[i];
2066 if (N1.empty() || !succ_L(N1, S1, DDG.get()))
2067 continue;
2068 for (int j = i + 1; j < e; ++j) {
2069 NodeSet &N2 = NodeSets[j];
2070 if (N1.compareRecMII(N2) != 0)
2071 continue;
2073 if (N2.empty() || !succ_L(N2, S2, DDG.get()))
2074 continue;
2075 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2076 N1.setColocate(++Colocate);
2077 N2.setColocate(Colocate);
2078 break;
2079 }
2080 }
2081 }
2082}
2083
2084/// Check if the existing node-sets are profitable. If not, then ignore the
2085/// recurrent node-sets, and attempt to schedule all nodes together. This is
2086/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2087/// then it's best to try to schedule all instructions together instead of
2088/// starting with the recurrent node-sets.
2089void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2090 // Look for loops with a large MII.
2091 if (MII < 17)
2092 return;
2093 // Check if the node-set contains only a simple add recurrence.
2094 for (auto &NS : NodeSets) {
2095 if (NS.getRecMII() > 2)
2096 return;
2097 if (NS.getMaxDepth() > MII)
2098 return;
2099 }
2100 NodeSets.clear();
2101 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2102}
2103
2104/// Add the nodes that do not belong to a recurrence set into groups
2105/// based upon connected components.
2106void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2107 SetVector<SUnit *> NodesAdded;
2109 // Add the nodes that are on a path between the previous node sets and
2110 // the current node set.
2111 for (NodeSet &I : NodeSets) {
2113 // Add the nodes from the current node set to the previous node set.
2114 if (succ_L(I, N, DDG.get())) {
2116 for (SUnit *NI : N) {
2117 Visited.clear();
2118 computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
2119 }
2120 if (!Path.empty())
2121 I.insert(Path.begin(), Path.end());
2122 }
2123 // Add the nodes from the previous node set to the current node set.
2124 N.clear();
2125 if (succ_L(NodesAdded, N, DDG.get())) {
2127 for (SUnit *NI : N) {
2128 Visited.clear();
2129 computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
2130 }
2131 if (!Path.empty())
2132 I.insert(Path.begin(), Path.end());
2133 }
2134 NodesAdded.insert(I.begin(), I.end());
2135 }
2136
2137 // Create a new node set with the connected nodes of any successor of a node
2138 // in a recurrent set.
2139 NodeSet NewSet;
2141 if (succ_L(NodesAdded, N, DDG.get()))
2142 for (SUnit *I : N)
2143 addConnectedNodes(I, NewSet, NodesAdded);
2144 if (!NewSet.empty())
2145 NodeSets.push_back(NewSet);
2146
2147 // Create a new node set with the connected nodes of any predecessor of a node
2148 // in a recurrent set.
2149 NewSet.clear();
2150 if (pred_L(NodesAdded, N, DDG.get()))
2151 for (SUnit *I : N)
2152 addConnectedNodes(I, NewSet, NodesAdded);
2153 if (!NewSet.empty())
2154 NodeSets.push_back(NewSet);
2155
2156 // Create new nodes sets with the connected nodes any remaining node that
2157 // has no predecessor.
2158 for (SUnit &SU : SUnits) {
2159 if (NodesAdded.count(&SU) == 0) {
2160 NewSet.clear();
2161 addConnectedNodes(&SU, NewSet, NodesAdded);
2162 if (!NewSet.empty())
2163 NodeSets.push_back(NewSet);
2164 }
2165 }
2166}
2167
2168/// Add the node to the set, and add all of its connected nodes to the set.
2169void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2170 SetVector<SUnit *> &NodesAdded) {
2171 NewSet.insert(SU);
2172 NodesAdded.insert(SU);
2173 for (auto &OE : DDG->getOutEdges(SU)) {
2174 SUnit *Successor = OE.getDst();
2175 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2176 NodesAdded.count(Successor) == 0)
2177 addConnectedNodes(Successor, NewSet, NodesAdded);
2178 }
2179 for (auto &IE : DDG->getInEdges(SU)) {
2180 SUnit *Predecessor = IE.getSrc();
2181 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2182 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2183 }
2184}
2185
2186/// Return true if Set1 contains elements in Set2. The elements in common
2187/// are returned in a different container.
2188static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2190 Result.clear();
2191 for (SUnit *SU : Set1) {
2192 if (Set2.count(SU) != 0)
2193 Result.insert(SU);
2194 }
2195 return !Result.empty();
2196}
2197
2198/// Merge the recurrence node sets that have the same initial node.
2199void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2200 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2201 ++I) {
2202 NodeSet &NI = *I;
2203 for (NodeSetType::iterator J = I + 1; J != E;) {
2204 NodeSet &NJ = *J;
2205 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2206 if (NJ.compareRecMII(NI) > 0)
2207 NI.setRecMII(NJ.getRecMII());
2208 for (SUnit *SU : *J)
2209 I->insert(SU);
2210 NodeSets.erase(J);
2211 E = NodeSets.end();
2212 } else {
2213 ++J;
2214 }
2215 }
2216 }
2217}
2218
2219/// Remove nodes that have been scheduled in previous NodeSets.
2220void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2221 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2222 ++I)
2223 for (NodeSetType::iterator J = I + 1; J != E;) {
2224 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2225
2226 if (J->empty()) {
2227 NodeSets.erase(J);
2228 E = NodeSets.end();
2229 } else {
2230 ++J;
2231 }
2232 }
2233}
2234
2235/// Compute an ordered list of the dependence graph nodes, which
2236/// indicates the order that the nodes will be scheduled. This is a
2237/// two-level algorithm. First, a partial order is created, which
2238/// consists of a list of sets ordered from highest to lowest priority.
2239void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2241 NodeOrder.clear();
2242
2243 for (auto &Nodes : NodeSets) {
2244 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2245 OrderKind Order;
2247 if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
2248 R.insert(N.begin(), N.end());
2249 Order = BottomUp;
2250 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2251 } else if (succ_L(NodeOrder, N, DDG.get()) &&
2252 llvm::set_is_subset(N, Nodes)) {
2253 R.insert(N.begin(), N.end());
2254 Order = TopDown;
2255 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2256 } else if (isIntersect(N, Nodes, R)) {
2257 // If some of the successors are in the existing node-set, then use the
2258 // top-down ordering.
2259 Order = TopDown;
2260 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2261 } else if (NodeSets.size() == 1) {
2262 for (const auto &N : Nodes)
2263 if (N->Succs.size() == 0)
2264 R.insert(N);
2265 Order = BottomUp;
2266 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2267 } else {
2268 // Find the node with the highest ASAP.
2269 SUnit *maxASAP = nullptr;
2270 for (SUnit *SU : Nodes) {
2271 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2272 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2273 maxASAP = SU;
2274 }
2275 R.insert(maxASAP);
2276 Order = BottomUp;
2277 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2278 }
2279
2280 while (!R.empty()) {
2281 if (Order == TopDown) {
2282 // Choose the node with the maximum height. If more than one, choose
2283 // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2284 // choose the node with the lowest MOV.
2285 while (!R.empty()) {
2286 SUnit *maxHeight = nullptr;
2287 for (SUnit *I : R) {
2288 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2289 maxHeight = I;
2290 else if (getHeight(I) == getHeight(maxHeight) &&
2292 maxHeight = I;
2293 else if (getHeight(I) == getHeight(maxHeight) &&
2295 getZeroLatencyHeight(maxHeight) &&
2296 getMOV(I) < getMOV(maxHeight))
2297 maxHeight = I;
2298 }
2299 NodeOrder.insert(maxHeight);
2300 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2301 R.remove(maxHeight);
2302 for (const auto &OE : DDG->getOutEdges(maxHeight)) {
2303 SUnit *SU = OE.getDst();
2304 if (Nodes.count(SU) == 0)
2305 continue;
2306 if (NodeOrder.contains(SU))
2307 continue;
2308 if (OE.ignoreDependence(false))
2309 continue;
2310 R.insert(SU);
2311 }
2312
2313 // FIXME: The following loop-carried dependencies may also need to be
2314 // considered.
2315 // - Physical register dependnecies (true-dependnece and WAW).
2316 // - Memory dependencies.
2317 for (const auto &IE : DDG->getInEdges(maxHeight)) {
2318 SUnit *SU = IE.getSrc();
2319 if (!IE.isAntiDep())
2320 continue;
2321 if (Nodes.count(SU) == 0)
2322 continue;
2323 if (NodeOrder.contains(SU))
2324 continue;
2325 R.insert(SU);
2326 }
2327 }
2328 Order = BottomUp;
2329 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2331 if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
2332 R.insert(N.begin(), N.end());
2333 } else {
2334 // Choose the node with the maximum depth. If more than one, choose
2335 // the node with the maximum ZeroLatencyDepth. If still more than one,
2336 // choose the node with the lowest MOV.
2337 while (!R.empty()) {
2338 SUnit *maxDepth = nullptr;
2339 for (SUnit *I : R) {
2340 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2341 maxDepth = I;
2342 else if (getDepth(I) == getDepth(maxDepth) &&
2344 maxDepth = I;
2345 else if (getDepth(I) == getDepth(maxDepth) &&
2347 getMOV(I) < getMOV(maxDepth))
2348 maxDepth = I;
2349 }
2350 NodeOrder.insert(maxDepth);
2351 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2352 R.remove(maxDepth);
2353 if (Nodes.isExceedSU(maxDepth)) {
2354 Order = TopDown;
2355 R.clear();
2356 R.insert(Nodes.getNode(0));
2357 break;
2358 }
2359 for (const auto &IE : DDG->getInEdges(maxDepth)) {
2360 SUnit *SU = IE.getSrc();
2361 if (Nodes.count(SU) == 0)
2362 continue;
2363 if (NodeOrder.contains(SU))
2364 continue;
2365 R.insert(SU);
2366 }
2367
2368 // FIXME: The following loop-carried dependencies may also need to be
2369 // considered.
2370 // - Physical register dependnecies (true-dependnece and WAW).
2371 // - Memory dependencies.
2372 for (const auto &OE : DDG->getOutEdges(maxDepth)) {
2373 SUnit *SU = OE.getDst();
2374 if (!OE.isAntiDep())
2375 continue;
2376 if (Nodes.count(SU) == 0)
2377 continue;
2378 if (NodeOrder.contains(SU))
2379 continue;
2380 R.insert(SU);
2381 }
2382 }
2383 Order = TopDown;
2384 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2386 if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
2387 R.insert(N.begin(), N.end());
2388 }
2389 }
2390 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2391 }
2392
2393 LLVM_DEBUG({
2394 dbgs() << "Node order: ";
2395 for (SUnit *I : NodeOrder)
2396 dbgs() << " " << I->NodeNum << " ";
2397 dbgs() << "\n";
2398 });
2399}
2400
2401/// Process the nodes in the computed order and create the pipelined schedule
2402/// of the instructions, if possible. Return true if a schedule is found.
2403bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2404
2405 if (NodeOrder.empty()){
2406 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2407 return false;
2408 }
2409
2410 bool scheduleFound = false;
2411 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2412 if (LimitRegPressure) {
2413 HRPDetector =
2414 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2415 HRPDetector->init(RegClassInfo);
2416 }
2417 // Keep increasing II until a valid schedule is found.
2418 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2419 Schedule.reset();
2420 Schedule.setInitiationInterval(II);
2421 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2422
2423 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2424 SetVector<SUnit *>::iterator NE = NodeOrder.end();
2425 do {
2426 SUnit *SU = *NI;
2427
2428 // Compute the schedule time for the instruction, which is based
2429 // upon the scheduled time for any predecessors/successors.
2430 int EarlyStart = INT_MIN;
2431 int LateStart = INT_MAX;
2432 Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
2433 LLVM_DEBUG({
2434 dbgs() << "\n";
2435 dbgs() << "Inst (" << SU->NodeNum << ") ";
2436 SU->getInstr()->dump();
2437 dbgs() << "\n";
2438 });
2439 LLVM_DEBUG(
2440 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2441
2442 if (EarlyStart > LateStart)
2443 scheduleFound = false;
2444 else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2445 scheduleFound =
2446 Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
2447 else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2448 scheduleFound =
2449 Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
2450 else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2451 LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
2452 // When scheduling a Phi it is better to start at the late cycle and
2453 // go backwards. The default order may insert the Phi too far away
2454 // from its first dependence.
2455 // Also, do backward search when all scheduled predecessors are
2456 // loop-carried output/order dependencies. Empirically, there are also
2457 // cases where scheduling becomes possible with backward search.
2458 if (SU->getInstr()->isPHI() ||
2459 Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
2460 scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
2461 else
2462 scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
2463 } else {
2464 int FirstCycle = Schedule.getFirstCycle();
2465 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2466 FirstCycle + getASAP(SU) + II - 1, II);
2467 }
2468
2469 // Even if we find a schedule, make sure the schedule doesn't exceed the
2470 // allowable number of stages. We keep trying if this happens.
2471 if (scheduleFound)
2472 if (SwpMaxStages > -1 &&
2473 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2474 scheduleFound = false;
2475
2476 LLVM_DEBUG({
2477 if (!scheduleFound)
2478 dbgs() << "\tCan't schedule\n";
2479 });
2480 } while (++NI != NE && scheduleFound);
2481
2482 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2483 if (scheduleFound)
2484 scheduleFound =
2485 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2486
2487 // If a schedule is found, check if it is a valid schedule too.
2488 if (scheduleFound)
2489 scheduleFound = Schedule.isValidSchedule(this);
2490
2491 // If a schedule was found and the option is enabled, check if the schedule
2492 // might generate additional register spills/fills.
2493 if (scheduleFound && LimitRegPressure)
2494 scheduleFound =
2495 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2496 }
2497
2498 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2499 << " (II=" << Schedule.getInitiationInterval()
2500 << ")\n");
2501
2502 if (scheduleFound) {
2503 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2504 if (!scheduleFound)
2505 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2506 }
2507
2508 if (scheduleFound) {
2509 Schedule.finalizeSchedule(this);
2510 Pass.ORE->emit([&]() {
2512 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2513 << "Schedule found with Initiation Interval: "
2514 << ore::NV("II", Schedule.getInitiationInterval())
2515 << ", MaxStageCount: "
2516 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2517 });
2518 } else
2519 Schedule.reset();
2520
2521 return scheduleFound && Schedule.getMaxStageCount() > 0;
2522}
2523
2525 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2526 Register Result;
2527 for (const MachineOperand &Use : MI.all_uses()) {
2528 Register Reg = Use.getReg();
2529 if (!Reg.isVirtual())
2530 return Register();
2531 if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
2532 continue;
2533 if (Result)
2534 return Register();
2535 Result = Reg;
2536 }
2537 return Result;
2538}
2539
2540/// When Op is a value that is incremented recursively in a loop and there is a
2541/// unique instruction that increments it, returns true and sets Value.
2543 if (!Op.isReg() || !Op.getReg().isVirtual())
2544 return false;
2545
2546 Register OrgReg = Op.getReg();
2547 Register CurReg = OrgReg;
2548 const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
2549 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
2550
2551 const TargetInstrInfo *TII =
2552 LoopBB->getParent()->getSubtarget().getInstrInfo();
2553 const TargetRegisterInfo *TRI =
2554 LoopBB->getParent()->getSubtarget().getRegisterInfo();
2555
2556 MachineInstr *Phi = nullptr;
2557 MachineInstr *Increment = nullptr;
2558
2559 // Traverse definitions until it reaches Op or an instruction that does not
2560 // satisfy the condition.
2561 // Acceptable example:
2562 // bb.0:
2563 // %0 = PHI %3, %bb.0, ...
2564 // %2 = ADD %0, Value
2565 // ... = LOAD %2(Op)
2566 // %3 = COPY %2
2567 while (true) {
2568 if (!CurReg.isValid() || !CurReg.isVirtual())
2569 return false;
2570 MachineInstr *Def = MRI.getVRegDef(CurReg);
2571 if (Def->getParent() != LoopBB)
2572 return false;
2573
2574 if (Def->isCopy()) {
2575 // Ignore copy instructions unless they contain subregisters
2576 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
2577 return false;
2578 CurReg = Def->getOperand(1).getReg();
2579 } else if (Def->isPHI()) {
2580 // There must be just one Phi
2581 if (Phi)
2582 return false;
2583 Phi = Def;
2584 CurReg = getLoopPhiReg(*Def, LoopBB);
2585 } else if (TII->getIncrementValue(*Def, Value)) {
2586 // Potentially a unique increment
2587 if (Increment)
2588 // Multiple increments exist
2589 return false;
2590
2591 const MachineOperand *BaseOp;
2592 int64_t Offset;
2593 bool OffsetIsScalable;
2594 if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
2595 TRI)) {
2596 // Pre/post increment instruction
2597 CurReg = BaseOp->getReg();
2598 } else {
2599 // If only one of the operands is defined within the loop, it is assumed
2600 // to be an incremented value.
2601 CurReg = findUniqueOperandDefinedInLoop(*Def);
2602 if (!CurReg.isValid())
2603 return false;
2604 }
2605 Increment = Def;
2606 } else {
2607 return false;
2608 }
2609 if (CurReg == OrgReg)
2610 break;
2611 }
2612
2613 if (!Phi || !Increment)
2614 return false;
2615
2616 return true;
2617}
2618
2619/// Return true if we can compute the amount the instruction changes
2620/// during each iteration. Set Delta to the amount of the change.
2621bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
2623 const MachineOperand *BaseOp;
2624 int64_t Offset;
2625 bool OffsetIsScalable;
2626 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2627 return false;
2628
2629 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2630 if (OffsetIsScalable)
2631 return false;
2632
2633 if (!BaseOp->isReg())
2634 return false;
2635
2636 return findLoopIncrementValue(*BaseOp, Delta);
2637}
2638
2639/// Check if we can change the instruction to use an offset value from the
2640/// previous iteration. If so, return true and set the base and offset values
2641/// so that we can rewrite the load, if necessary.
2642/// v1 = Phi(v0, v3)
2643/// v2 = load v1, 0
2644/// v3 = post_store v1, 4, x
2645/// This function enables the load to be rewritten as v2 = load v3, 4.
2646bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2647 unsigned &BasePos,
2648 unsigned &OffsetPos,
2649 unsigned &NewBase,
2650 int64_t &Offset) {
2651 // Get the load instruction.
2652 if (TII->isPostIncrement(*MI))
2653 return false;
2654 unsigned BasePosLd, OffsetPosLd;
2655 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2656 return false;
2657 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2658
2659 // Look for the Phi instruction.
2660 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2661 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2662 if (!Phi || !Phi->isPHI())
2663 return false;
2664 // Get the register defined in the loop block.
2665 unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2666 if (!PrevReg)
2667 return false;
2668
2669 // Check for the post-increment load/store instruction.
2670 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2671 if (!PrevDef || PrevDef == MI)
2672 return false;
2673
2674 if (!TII->isPostIncrement(*PrevDef))
2675 return false;
2676
2677 unsigned BasePos1 = 0, OffsetPos1 = 0;
2678 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2679 return false;
2680
2681 // Make sure that the instructions do not access the same memory location in
2682 // the next iteration.
2683 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2684 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2686 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2687 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2688 MF.deleteMachineInstr(NewMI);
2689 if (!Disjoint)
2690 return false;
2691
2692 // Set the return value once we determine that we return true.
2693 BasePos = BasePosLd;
2694 OffsetPos = OffsetPosLd;
2695 NewBase = PrevReg;
2696 Offset = StoreOffset;
2697 return true;
2698}
2699
2700/// Apply changes to the instruction if needed. The changes are need
2701/// to improve the scheduling and depend up on the final schedule.
2703 SMSchedule &Schedule) {
2704 SUnit *SU = getSUnit(MI);
2706 InstrChanges.find(SU);
2707 if (It != InstrChanges.end()) {
2708 std::pair<unsigned, int64_t> RegAndOffset = It->second;
2709 unsigned BasePos, OffsetPos;
2710 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2711 return;
2712 Register BaseReg = MI->getOperand(BasePos).getReg();
2713 MachineInstr *LoopDef = findDefInLoop(BaseReg);
2714 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2715 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2716 int BaseStageNum = Schedule.stageScheduled(SU);
2717 int BaseCycleNum = Schedule.cycleScheduled(SU);
2718 if (BaseStageNum < DefStageNum) {
2720 int OffsetDiff = DefStageNum - BaseStageNum;
2721 if (DefCycleNum < BaseCycleNum) {
2722 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2723 if (OffsetDiff > 0)
2724 --OffsetDiff;
2725 }
2726 int64_t NewOffset =
2727 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2728 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2729 SU->setInstr(NewMI);
2730 MISUnitMap[NewMI] = SU;
2731 NewMIs[MI] = NewMI;
2732 }
2733 }
2734}
2735
2736/// Return the instruction in the loop that defines the register.
2737/// If the definition is a Phi, then follow the Phi operand to
2738/// the instruction in the loop.
2739MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2741 MachineInstr *Def = MRI.getVRegDef(Reg);
2742 while (Def->isPHI()) {
2743 if (!Visited.insert(Def).second)
2744 break;
2745 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2746 if (Def->getOperand(i + 1).getMBB() == BB) {
2747 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2748 break;
2749 }
2750 }
2751 return Def;
2752}
2753
2754/// Return false if there is no overlap between the region accessed by BaseMI in
2755/// an iteration and the region accessed by OtherMI in subsequent iterations.
2757 const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
2758 int DeltaB, DeltaO, Delta;
2759 if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
2760 DeltaB != DeltaO)
2761 return true;
2762 Delta = DeltaB;
2763
2764 const MachineOperand *BaseOpB, *BaseOpO;
2765 int64_t OffsetB, OffsetO;
2766 bool OffsetBIsScalable, OffsetOIsScalable;
2768 if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
2769 OffsetBIsScalable, TRI) ||
2770 !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
2771 OffsetOIsScalable, TRI))
2772 return true;
2773
2774 if (OffsetBIsScalable || OffsetOIsScalable)
2775 return true;
2776
2777 if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
2778 // Pass cases with different base operands but same initial values.
2779 // Typically for when pre/post increment is used.
2780
2781 if (!BaseOpB->isReg() || !BaseOpO->isReg())
2782 return true;
2783 Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
2784 if (!RegB.isVirtual() || !RegO.isVirtual())
2785 return true;
2786
2787 MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
2788 MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
2789 if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
2790 return true;
2791
2792 unsigned InitValB = 0;
2793 unsigned LoopValB = 0;
2794 unsigned InitValO = 0;
2795 unsigned LoopValO = 0;
2796 getPhiRegs(*DefB, BB, InitValB, LoopValB);
2797 getPhiRegs(*DefO, BB, InitValO, LoopValO);
2798 MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
2799 MachineInstr *InitDefO = MRI.getVRegDef(InitValO);
2800
2801 if (!InitDefB->isIdenticalTo(*InitDefO))
2802 return true;
2803 }
2804
2805 LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
2806 LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();
2807
2808 // This is the main test, which checks the offset values and the loop
2809 // increment value to determine if the accesses may be loop carried.
2810 if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
2811 return true;
2812
2813 LLVM_DEBUG({
2814 dbgs() << "Overlap check:\n";
2815 dbgs() << " BaseMI: ";
2816 BaseMI->dump();
2817 dbgs() << " Base + " << OffsetB << " + I * " << Delta
2818 << ", Len: " << AccessSizeB.getValue() << "\n";
2819 dbgs() << " OtherMI: ";
2820 OtherMI->dump();
2821 dbgs() << " Base + " << OffsetO << " + I * " << Delta
2822 << ", Len: " << AccessSizeO.getValue() << "\n";
2823 });
2824
2825 // Excessive overlap may be detected in strided patterns.
2826 // For example, the memory addresses of the store and the load in
2827 // for (i=0; i<n; i+=2) a[i+1] = a[i];
2828 // are assumed to overlap.
2829 if (Delta < 0) {
2830 int64_t BaseMinAddr = OffsetB;
2831 int64_t OhterNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
2832 if (BaseMinAddr > OhterNextIterMaxAddr) {
2833 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
2834 return false;
2835 }
2836 } else {
2837 int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
2838 int64_t OtherNextIterMinAddr = OffsetO + Delta;
2839 if (BaseMaxAddr < OtherNextIterMinAddr) {
2840 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
2841 return false;
2842 }
2843 }
2844 LLVM_DEBUG(dbgs() << " Result: Overlap\n");
2845 return true;
2846}
2847
2848/// Return true for an order or output dependence that is loop carried
2849/// potentially. A dependence is loop carried if the destination defines a value
2850/// that may be used or defined by the source in a subsequent iteration.
2852 const SwingSchedulerDDGEdge &Edge) const {
2853 if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
2854 Edge.getDst()->isBoundaryNode())
2855 return false;
2856
2858 return true;
2859
2860 if (Edge.isOutputDep())
2861 return true;
2862
2863 MachineInstr *SI = Edge.getSrc()->getInstr();
2864 MachineInstr *DI = Edge.getDst()->getInstr();
2865 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2866
2867 // Assume ordered loads and stores may have a loop carried dependence.
2868 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2869 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2870 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2871 return true;
2872
2873 if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
2874 return false;
2875
2876 // The conservative assumption is that a dependence between memory operations
2877 // may be loop carried. The following code checks when it can be proved that
2878 // there is no loop carried dependence.
2879 return mayOverlapInLaterIter(DI, SI);
2880}
2881
2882void SwingSchedulerDAG::postProcessDAG() {
2883 for (auto &M : Mutations)
2884 M->apply(this);
2885}
2886
2887/// Try to schedule the node at the specified StartCycle and continue
2888/// until the node is schedule or the EndCycle is reached. This function
2889/// returns true if the node is scheduled. This routine may search either
2890/// forward or backward for a place to insert the instruction based upon
2891/// the relative values of StartCycle and EndCycle.
2892bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2893 bool forward = true;
2894 LLVM_DEBUG({
2895 dbgs() << "Trying to insert node between " << StartCycle << " and "
2896 << EndCycle << " II: " << II << "\n";
2897 });
2898 if (StartCycle > EndCycle)
2899 forward = false;
2900
2901 // The terminating condition depends on the direction.
2902 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2903 for (int curCycle = StartCycle; curCycle != termCycle;
2904 forward ? ++curCycle : --curCycle) {
2905
2906 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2907 ProcItinResources.canReserveResources(*SU, curCycle)) {
2908 LLVM_DEBUG({
2909 dbgs() << "\tinsert at cycle " << curCycle << " ";
2910 SU->getInstr()->dump();
2911 });
2912
2913 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2914 ProcItinResources.reserveResources(*SU, curCycle);
2915 ScheduledInstrs[curCycle].push_back(SU);
2916 InstrToCycle.insert(std::make_pair(SU, curCycle));
2917 if (curCycle > LastCycle)
2918 LastCycle = curCycle;
2919 if (curCycle < FirstCycle)
2920 FirstCycle = curCycle;
2921 return true;
2922 }
2923 LLVM_DEBUG({
2924 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2925 SU->getInstr()->dump();
2926 });
2927 }
2928 return false;
2929}
2930
2931// Return the cycle of the earliest scheduled instruction in the chain.
2933 const SwingSchedulerDDG *DDG) {
2936 Worklist.push_back(Dep);
2937 int EarlyCycle = INT_MAX;
2938 while (!Worklist.empty()) {
2939 const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
2940 SUnit *PrevSU = Cur.getSrc();
2941 if (Visited.count(PrevSU))
2942 continue;
2943 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2944 if (it == InstrToCycle.end())
2945 continue;
2946 EarlyCycle = std::min(EarlyCycle, it->second);
2947 for (const auto &IE : DDG->getInEdges(PrevSU))
2948 if (IE.isOrderDep() || IE.isOutputDep())
2949 Worklist.push_back(IE);
2950 Visited.insert(PrevSU);
2951 }
2952 return EarlyCycle;
2953}
2954
2955// Return the cycle of the latest scheduled instruction in the chain.
2957 const SwingSchedulerDDG *DDG) {
2960 Worklist.push_back(Dep);
2961 int LateCycle = INT_MIN;
2962 while (!Worklist.empty()) {
2963 const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
2964 SUnit *SuccSU = Cur.getDst();
2965 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2966 continue;
2967 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2968 if (it == InstrToCycle.end())
2969 continue;
2970 LateCycle = std::max(LateCycle, it->second);
2971 for (const auto &OE : DDG->getOutEdges(SuccSU))
2972 if (OE.isOrderDep() || OE.isOutputDep())
2973 Worklist.push_back(OE);
2974 Visited.insert(SuccSU);
2975 }
2976 return LateCycle;
2977}
2978
2979 /// If an instruction has a use that spans multiple iterations, then
2980 /// return true. These instructions are characterized by having a back-edge
2981 /// to a Phi, which contains a reference to another Phi.
// NOTE(review): the signature line (orig 2982) was lost in extraction.  The
// visible body actually returns the anti-dependent Phi's SUnit (or nullptr),
// so the header comment's "return true" reads stale -- confirm against the
// original file.
2983 for (auto &P : SU->Preds)
2984 if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
2985 for (auto &S : P.getSUnit()->Succs)
2986 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2987 return P.getSUnit();
2988 return nullptr;
2989 }
2990
2991/// Compute the scheduling start slot for the instruction. The start slot
2992/// depends on any predecessor or successor nodes scheduled already.
2993void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2994 int II, SwingSchedulerDAG *DAG) {
2995 const SwingSchedulerDDG *DDG = DAG->getDDG();
2996
2997 // Iterate over each instruction that has been scheduled already. The start
2998 // slot computation depends on whether the previously scheduled instruction
2999 // is a predecessor or successor of the specified instruction.
3000 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3001 for (SUnit *I : getInstructions(cycle)) {
3002 for (const auto &IE : DDG->getInEdges(SU)) {
3003 if (IE.getSrc() == I) {
3004 // FIXME: Add reverse edge to `DDG` instead of calling
3005 // `isLoopCarriedDep`
3006 if (DAG->isLoopCarriedDep(IE)) {
3007 int End = earliestCycleInChain(IE, DDG) + (II - 1);
3008 *MinLateStart = std::min(*MinLateStart, End);
3009 }
3010 int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
3011 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3012 }
3013 }
3014
3015 for (const auto &OE : DDG->getOutEdges(SU)) {
3016 if (OE.getDst() == I) {
3017 // FIXME: Add reverse edge to `DDG` instead of calling
3018 // `isLoopCarriedDep`
3019 if (DAG->isLoopCarriedDep(OE)) {
3020 int Start = latestCycleInChain(OE, DDG) + 1 - II;
3021 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
3022 }
3023 int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
3024 *MinLateStart = std::min(*MinLateStart, LateStart);
3025 }
3026 }
3027
3028 SUnit *BE = multipleIterations(I, DAG);
3029 for (const auto &Dep : SU->Preds) {
3030 // For instruction that requires multiple iterations, make sure that
3031 // the dependent instruction is not scheduled past the definition.
3032 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3033 !SU->isPred(I))
3034 *MinLateStart = std::min(*MinLateStart, cycle);
3035 }
3036 }
3037 }
3038}
3039
3040 /// Order the instructions within a cycle so that the definitions occur
3041 /// before the uses. Returns true if the instruction is added to the start
3042 /// of the list, or false if added to the end.
// NOTE(review): the line carrying the function name and first parameters
// (orig 3043) was lost in extraction; per the recursive calls below this is
// SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, ...).
// Also note the header comment's "Returns true/false" is stale: the visible
// body returns void and communicates ordering by mutating Insts.
3044 std::deque<SUnit *> &Insts) const {
3045 MachineInstr *MI = SU->getInstr();
3046 bool OrderBeforeUse = false;
3047 bool OrderAfterDef = false;
3048 bool OrderBeforeDef = false;
3049 unsigned MoveDef = 0;
3050 unsigned MoveUse = 0;
3051 int StageInst1 = stageScheduled(SU);
3052 const SwingSchedulerDDG *DDG = SSD->getDDG();
3053
// Scan each already-placed instruction at position Pos and classify its
// register interactions with MI's operands.
3054 unsigned Pos = 0;
3055 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
3056 ++I, ++Pos) {
3057 for (MachineOperand &MO : MI->operands()) {
3058 if (!MO.isReg() || !MO.getReg().isVirtual())
3059 continue;
3060
// If this instruction's base register was rewritten by the pipeliner,
// compare against the replacement register instead.
3061 Register Reg = MO.getReg();
3062 unsigned BasePos, OffsetPos;
3063 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3064 if (MI->getOperand(BasePos).getReg() == Reg)
3065 if (unsigned NewReg = SSD->getInstrBaseReg(SU))
3066 Reg = NewReg;
3067 bool Reads, Writes;
3068 std::tie(Reads, Writes) =
3069 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
3070 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
3071 OrderBeforeUse = true;
3072 if (MoveUse == 0)
3073 MoveUse = Pos;
3074 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
3075 // Add the instruction after the scheduled instruction.
3076 OrderAfterDef = true;
3077 MoveDef = Pos;
3078 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
3079 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
3080 OrderBeforeUse = true;
3081 if (MoveUse == 0)
3082 MoveUse = Pos;
3083 } else {
3084 OrderAfterDef = true;
3085 MoveDef = Pos;
3086 }
3087 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
3088 OrderBeforeUse = true;
3089 if (MoveUse == 0)
3090 MoveUse = Pos;
3091 if (MoveUse != 0) {
3092 OrderAfterDef = true;
3093 MoveDef = Pos - 1;
3094 }
3095 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
3096 // Add the instruction before the scheduled instruction.
3097 OrderBeforeUse = true;
3098 if (MoveUse == 0)
3099 MoveUse = Pos;
3100 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
3101 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
3102 if (MoveUse == 0) {
3103 OrderBeforeDef = true;
3104 MoveUse = Pos;
3105 }
3106 }
3107 }
3108 // Check for order dependences between instructions. Make sure the source
3109 // is ordered before the destination.
3110 for (auto &OE : DDG->getOutEdges(SU)) {
3111 if (OE.getDst() != *I)
3112 continue;
3113 if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
3114 OrderBeforeUse = true;
3115 if (Pos < MoveUse)
3116 MoveUse = Pos;
3117 }
3118 // We did not handle HW dependences in previous for loop,
3119 // and we normally set Latency = 0 for Anti/Output deps,
3120 // so may have nodes in same cycle with Anti/Output dependent on HW regs.
3121 else if ((OE.isAntiDep() || OE.isOutputDep()) &&
3122 stageScheduled(*I) == StageInst1) {
3123 OrderBeforeUse = true;
3124 if ((MoveUse == 0) || (Pos < MoveUse))
3125 MoveUse = Pos;
3126 }
3127 }
3128 for (auto &IE : DDG->getInEdges(SU)) {
3129 if (IE.getSrc() != *I)
3130 continue;
3131 if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
3132 stageScheduled(*I) == StageInst1) {
3133 OrderAfterDef = true;
3134 MoveDef = Pos;
3135 }
3136 }
3137 }
3138
3139 // A circular dependence.
3140 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3141 OrderBeforeUse = false;
3142
3143 // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3144 // to a loop-carried dependence.
3145 if (OrderBeforeDef)
3146 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3147
3148 // The uncommon case when the instruction order needs to be updated because
3149 // there is both a use and def.
// Pull the conflicting use/def out of the list and recursively re-insert
// them around SU so def-before-use holds.
3150 if (OrderBeforeUse && OrderAfterDef) {
3151 SUnit *UseSU = Insts.at(MoveUse);
3152 SUnit *DefSU = Insts.at(MoveDef);
3153 if (MoveUse > MoveDef) {
3154 Insts.erase(Insts.begin() + MoveUse);
3155 Insts.erase(Insts.begin() + MoveDef);
3156 } else {
3157 Insts.erase(Insts.begin() + MoveDef);
3158 Insts.erase(Insts.begin() + MoveUse);
3159 }
3160 orderDependence(SSD, UseSU, Insts);
3161 orderDependence(SSD, SU, Insts);
3162 orderDependence(SSD, DefSU, Insts);
3163 return;
3164 }
3165 // Put the new instruction first if there is a use in the list. Otherwise,
3166 // put it at the end of the list.
3167 if (OrderBeforeUse)
3168 Insts.push_front(SU);
3169 else
3170 Insts.push_back(SU);
3171 }
3172
3173 /// Return true if the scheduled Phi has a loop carried operand.
// NOTE(review): the first signature line (orig 3174) was lost in extraction.
3175 MachineInstr &Phi) const {
3176 if (!Phi.isPHI())
3177 return false;
// The assert below is redundant with the early return above; kept verbatim.
3178 assert(Phi.isPHI() && "Expecting a Phi.");
3179 SUnit *DefSU = SSD->getSUnit(&Phi);
3180 unsigned DefCycle = cycleScheduled(DefSU);
3181 int DefStage = stageScheduled(DefSU);
3182
3183 unsigned InitVal = 0;
3184 unsigned LoopVal = 0;
3185 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
// Conservatively treat a missing or Phi-defined loop value as loop carried.
3186 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3187 if (!UseSU)
3188 return true;
3189 if (UseSU->getInstr()->isPHI())
3190 return true;
3191 unsigned LoopCycle = cycleScheduled(UseSU);
3192 int LoopStage = stageScheduled(UseSU);
3193 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3194 }
3195
3196 /// Return true if the instruction is a definition that is loop carried
3197 /// and defines the use on the next iteration.
3198 /// v1 = phi(v2, v3)
3199 /// (Def) v3 = op v1
3200 /// (MO) = v1
3201 /// If MO appears before Def, then v1 and v3 may get assigned to the same
3202 /// register.
// NOTE(review): the first signature line (orig 3203) was lost in extraction.
3204 MachineInstr *Def,
3205 MachineOperand &MO) const {
3206 if (!MO.isReg())
3207 return false;
3208 if (Def->isPHI())
3209 return false;
// MO must be fed by a loop-carried Phi in the same block as Def.
3210 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3211 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3212 return false;
3213 if (!isLoopCarried(SSD, *Phi))
3214 return false;
// Def is the loop-carried definition iff it writes the Phi's loop register.
3215 unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3216 for (MachineOperand &DMO : Def->all_defs()) {
3217 if (DMO.getReg() == LoopReg)
3218 return true;
3219 }
3220 return false;
3221 }
3222
3223 /// Return true if all scheduled predecessors are loop-carried output/order
3224 /// dependencies.
// NOTE(review): the first signature line (orig 3225) was lost in extraction.
// The visible body returns true only when none of SU's DDG in-edge sources
// has been scheduled yet; presumably loop-carried output/order edges are not
// part of getInEdges() -- confirm against the DDG construction.
3226 SUnit *SU, const SwingSchedulerDDG *DDG) const {
3227 for (const auto &IE : DDG->getInEdges(SU))
3228 if (InstrToCycle.count(IE.getSrc()))
3229 return false;
3230 return true;
3231 }
3232
3233 /// Determine transitive dependences of unpipelineable instructions
// NOTE(review): the signature lines (orig 3234-3235) were lost in extraction.
// Returns the closure (DoNotPipeline) of nodes that must stay unpipelined.
3236 SmallSet<SUnit *, 8> DoNotPipeline;
3237 SmallVector<SUnit *, 8> Worklist;
3238
// Seed the worklist with instructions the target refuses to pipeline.
3239 for (auto &SU : SSD->SUnits)
3240 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3241 Worklist.push_back(&SU);
3242
3243 const SwingSchedulerDDG *DDG = SSD->getDDG();
3244 while (!Worklist.empty()) {
3245 auto SU = Worklist.pop_back_val();
3246 if (DoNotPipeline.count(SU))
3247 continue;
3248 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3249 DoNotPipeline.insert(SU);
// Everything SU transitively depends on must also stay unpipelined.
3250 for (const auto &IE : DDG->getInEdges(SU))
3251 Worklist.push_back(IE.getSrc());
3252
3253 // To preserve previous behavior and prevent regression
3254 // FIXME: Remove if this doesn't have significant impact on performance
3255 for (const auto &OE : DDG->getOutEdges(SU))
3256 if (OE.getDistance() == 1)
3257 Worklist.push_back(OE.getDst());
3258 }
3259 return DoNotPipeline;
3260 }
3261
3262 // Determine all instructions upon which any unpipelineable instruction depends
3263 // and ensure that they are in stage 0. If unable to do so, return false.
// NOTE(review): the signature lines (orig 3264-3266) were lost in extraction.
3267
3268 int NewLastCycle = INT_MIN;
3269 for (SUnit &SU : SSD->SUnits) {
3270 if (!SU.isInstr())
3271 continue;
// Nodes that may be pipelined, or are already in stage 0, keep their cycle.
3272 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3273 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3274 continue;
3275 }
3276
3277 // Put the non-pipelined instruction as early as possible in the schedule
3278 int NewCycle = getFirstCycle();
3279 for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
3280 if (IE.getDistance() == 0)
3281 NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);
3282
3283 // To preserve previous behavior and prevent regression
3284 // FIXME: Remove if this doesn't have significant impact on performance
3285 for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
3286 if (OE.getDistance() == 1)
3287 NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);
3288
// Physically move the instruction into its new cycle's instruction list.
3289 int OldCycle = InstrToCycle[&SU];
3290 if (OldCycle != NewCycle) {
3291 InstrToCycle[&SU] = NewCycle;
3292 auto &OldS = getInstructions(OldCycle);
3293 llvm::erase(OldS, &SU);
3294 getInstructions(NewCycle).emplace_back(&SU);
3295 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3296 << ") is not pipelined; moving from cycle " << OldCycle
3297 << " to " << NewCycle << " Instr:" << *SU.getInstr());
3298 }
3299 NewLastCycle = std::max(NewLastCycle, NewCycle);
3300 }
3301 LastCycle = NewLastCycle;
3302 return true;
3303 }
3304
3305 // Check if the generated schedule is valid. This function checks if
3306 // an instruction that uses a physical register is scheduled in a
3307 // different stage than the definition. The pipeliner does not handle
3308 // physical register values that may cross a basic block boundary.
3309 // Furthermore, if a physical def/use pair is assigned to the same
3310 // cycle, orderDependence does not guarantee def/use ordering, so that
3311 // case should be considered invalid. (The test checks for both
3312 // earlier and same-cycle use to be more robust.)
// NOTE(review): the signature line (orig 3313) was lost in extraction.
3314 for (SUnit &SU : SSD->SUnits) {
3315 if (!SU.hasPhysRegDefs)
3316 continue;
3317 int StageDef = stageScheduled(&SU);
3318 int CycleDef = InstrToCycle[&SU];
3319 assert(StageDef != -1 && "Instruction should have been scheduled.");
3320 for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
3321 SUnit *Dst = OE.getDst();
3322 if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
3323 if (OE.getReg().isPhysical()) {
// A phys-reg use must stay in the def's stage and strictly after its cycle.
3324 if (stageScheduled(Dst) != StageDef)
3325 return false;
3326 if (InstrToCycle[Dst] <= CycleDef)
3327 return false;
3328 }
3329 }
3330 }
3331 return true;
3332 }
3333
3334/// A property of the node order in swing-modulo-scheduling is
3335/// that for nodes outside circuits the following holds:
3336/// none of them is scheduled after both a successor and a
3337/// predecessor.
3338/// The method below checks whether the property is met.
3339/// If not, debug information is printed and statistics information updated.
3340/// Note that we do not use an assert statement.
3341/// The reason is that although an invalid node order may prevent
3342/// the pipeliner from finding a pipelined schedule for arbitrary II,
3343/// it does not lead to the generation of incorrect code.
3344void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3345
3346 // a sorted vector that maps each SUnit to its index in the NodeOrder
3347 typedef std::pair<SUnit *, unsigned> UnitIndex;
3348 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3349
3350 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3351 Indices.push_back(std::make_pair(NodeOrder[i], i));
3352
3353 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3354 return std::get<0>(i1) < std::get<0>(i2);
3355 };
3356
3357 // sort, so that we can perform a binary search
3358 llvm::sort(Indices, CompareKey);
3359
3360 bool Valid = true;
3361 (void)Valid;
3362 // for each SUnit in the NodeOrder, check whether
3363 // it appears after both a successor and a predecessor
3364 // of the SUnit. If this is the case, and the SUnit
3365 // is not part of circuit, then the NodeOrder is not
3366 // valid.
3367 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3368 SUnit *SU = NodeOrder[i];
3369 unsigned Index = i;
3370
3371 bool PredBefore = false;
3372 bool SuccBefore = false;
3373
3374 SUnit *Succ;
3375 SUnit *Pred;
3376 (void)Succ;
3377 (void)Pred;
3378
3379 for (const auto &IE : DDG->getInEdges(SU)) {
3380 SUnit *PredSU = IE.getSrc();
3381 unsigned PredIndex = std::get<1>(
3382 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3383 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3384 PredBefore = true;
3385 Pred = PredSU;
3386 break;
3387 }
3388 }
3389
3390 for (const auto &OE : DDG->getOutEdges(SU)) {
3391 SUnit *SuccSU = OE.getDst();
3392 // Do not process a boundary node, it was not included in NodeOrder,
3393 // hence not in Indices either, call to std::lower_bound() below will
3394 // return Indices.end().
3395 if (SuccSU->isBoundaryNode())
3396 continue;
3397 unsigned SuccIndex = std::get<1>(
3398 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3399 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3400 SuccBefore = true;
3401 Succ = SuccSU;
3402 break;
3403 }
3404 }
3405
3406 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3407 // instructions in circuits are allowed to be scheduled
3408 // after both a successor and predecessor.
3409 bool InCircuit = llvm::any_of(
3410 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3411 if (InCircuit)
3412 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3413 else {
3414 Valid = false;
3415 NumNodeOrderIssues++;
3416 LLVM_DEBUG(dbgs() << "Predecessor ");
3417 }
3418 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3419 << " are scheduled before node " << SU->NodeNum
3420 << "\n");
3421 }
3422 }
3423
3424 LLVM_DEBUG({
3425 if (!Valid)
3426 dbgs() << "Invalid node order found!\n";
3427 });
3428}
3429
3430 /// Attempt to fix the degenerate cases when the instruction serialization
3431 /// causes the register lifetimes to overlap. For example,
3432 /// p' = store_pi(p, b)
3433 /// = load p, offset
3434 /// In this case p and p' overlap, which means that two registers are needed.
3435 /// Instead, this function changes the load to use p' and updates the offset.
3436 void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3437 unsigned OverlapReg = 0;
3438 unsigned NewBaseReg = 0;
3439 for (SUnit *SU : Instrs) {
3440 MachineInstr *MI = SU->getInstr();
3441 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3442 const MachineOperand &MO = MI->getOperand(i);
3443 // Look for an instruction that uses p. The instruction occurs in the
3444 // same cycle but occurs later in the serialized order.
3445 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3446 // Check that the instruction appears in the InstrChanges structure,
3447 // which contains instructions that can have the offset updated.
// NOTE(review): the declaration of iterator `It` (orig line 3448) was lost
// in extraction.
3449 InstrChanges.find(SU);
3450 if (It != InstrChanges.end()) {
3451 unsigned BasePos, OffsetPos;
3452 // Update the base register and adjust the offset.
3453 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
// NOTE(review): the creation of `NewMI` (orig line 3454, presumably a clone
// of MI) was lost in extraction.
3455 NewMI->getOperand(BasePos).setReg(NewBaseReg);
3456 int64_t NewOffset =
3457 MI->getOperand(OffsetPos).getImm() - It->second.second;
3458 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3459 SU->setInstr(NewMI);
3460 MISUnitMap[NewMI] = SU;
3461 NewMIs[MI] = NewMI;
3462 }
3463 }
// Reset the tracked pair whether or not the rewrite happened.
3464 OverlapReg = 0;
3465 NewBaseReg = 0;
3466 break;
3467 }
3468 // Look for an instruction of the form p' = op(p), which uses and defines
3469 // two virtual registers that get allocated to the same physical register.
3470 unsigned TiedUseIdx = 0;
3471 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3472 // OverlapReg is p in the example above.
3473 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3474 // NewBaseReg is p' in the example above.
3475 NewBaseReg = MI->getOperand(i).getReg();
3476 break;
3477 }
3478 }
3479 }
3480 }
3481
3482 std::deque<SUnit *>
// NOTE(review): the line naming the function (orig 3483) was lost in
// extraction; per the calls below this is SMSchedule::reorderInstructions.
// Returns the cycle's instructions with Phis first (original relative
// order), followed by the non-Phis ordered def-before-use.
3484 const std::deque<SUnit *> &Instrs) const {
3485 std::deque<SUnit *> NewOrderPhi;
3486 for (SUnit *SU : Instrs) {
3487 if (SU->getInstr()->isPHI())
3488 NewOrderPhi.push_back(SU);
3489 }
3490 std::deque<SUnit *> NewOrderI;
// Non-Phis are inserted def-before-use via orderDependence.
3491 for (SUnit *SU : Instrs) {
3492 if (!SU->getInstr()->isPHI())
3493 orderDependence(SSD, SU, NewOrderI);
3494 }
3495 llvm::append_range(NewOrderPhi, NewOrderI);
3496 return NewOrderPhi;
3497 }
3498
3499 /// After the schedule has been formed, call this function to combine
3500 /// the instructions from the different stages/cycles. That is, this
3501 /// function creates a schedule that represents a single iteration.
// NOTE(review): the signature line (orig 3502) was lost in extraction;
// presumably void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD).
3503 // Move all instructions to the first stage from later stages.
3504 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3505 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3506 ++stage) {
3507 std::deque<SUnit *> &cycleInstrs =
3508 ScheduledInstrs[cycle + (stage * InitiationInterval)];
// push_front over the reversed list preserves the stage's internal order.
3509 for (SUnit *SU : llvm::reverse(cycleInstrs))
3510 ScheduledInstrs[cycle].push_front(SU);
3511 }
3512 }
3513
3514 // Erase all the elements in the later stages. Only one iteration should
3515 // remain in the scheduled list, and it contains all the instructions.
3516 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3517 ScheduledInstrs.erase(cycle);
3518
3519 // Change the registers in instruction as specified in the InstrChanges
3520 // map. We need to use the new registers to create the correct order.
3521 for (const SUnit &SU : SSD->SUnits)
3522 SSD->applyInstrChange(SU.getInstr(), *this);
3523
3524 // Reorder the instructions in each cycle to fix and improve the
3525 // generated code.
3526 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3527 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3528 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3529 SSD->fixupRegisterOverlaps(cycleInstrs);
3530 }
3531
3532 LLVM_DEBUG(dump(););
3533 }
3534
// NOTE(review): the signature line (orig 3535) was lost in extraction; this
// prints the NodeSet summary followed by one line per member SUnit.
3536 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3537 << " depth " << MaxDepth << " col " << Colocate << "\n";
3538 for (const auto &I : Nodes)
3539 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3540 os << "\n";
3541 }
3542
3543#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3544 /// Print the schedule information to the given output.
// NOTE(review): the signature line (orig 3545) was lost in extraction.
3546 // Iterate over each cycle.
3547 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3548 // Iterate over each instruction in the cycle.
// NOTE(review): the iterator is dereferenced without checking for end();
// presumably every cycle in [first, final] is populated -- confirm.
3549 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3550 for (SUnit *CI : cycleInstrs->second) {
3551 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3552 os << "(" << CI->NodeNum << ") ";
3553 CI->getInstr()->print(os);
3554 os << "\n";
3555 }
3556 }
3557 }
3558
3559/// Utility function used for debugging to print the schedule.
3562
3563void ResourceManager::dumpMRT() const {
3564 LLVM_DEBUG({
3565 if (UseDFA)
3566 return;
3567 std::stringstream SS;
3568 SS << "MRT:\n";
3569 SS << std::setw(4) << "Slot";
3570 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3571 SS << std::setw(3) << I;
3572 SS << std::setw(7) << "#Mops"
3573 << "\n";
3574 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3575 SS << std::setw(4) << Slot;
3576 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3577 SS << std::setw(3) << MRT[Slot][I];
3578 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3579 }
3580 dbgs() << SS.str();
3581 });
3582}
3583#endif
3584
// NOTE(review): the first signature line (orig 3585) was lost in extraction;
// this builds one unique uint64_t bitmask per processor resource kind.
3586 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3587 unsigned ProcResourceID = 0;
3588
3589 // We currently limit the resource kinds to 64 and below so that we can use
3590 // uint64_t for Masks
3591 assert(SM.getNumProcResourceKinds() < 64 &&
3592 "Too many kinds of resources, unsupported");
3593 // Create a unique bitmask for every processor resource unit.
3594 // Skip resource at index 0, since it always references 'InvalidUnit'.
3595 Masks.resize(SM.getNumProcResourceKinds());
3596 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
// NOTE(review): the declaration of `Desc` (orig line 3597, presumably from
// SM.getProcResource(I)) was lost in extraction.
3598 if (Desc.SubUnitsIdxBegin)
3599 continue;
3600 Masks[I] = 1ULL << ProcResourceID;
3601 ProcResourceID++;
3602 }
3603 // Create a unique bitmask for every processor resource group.
// A group's mask is its own bit OR'ed with the masks of its sub-units.
3604 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
// NOTE(review): the declaration of `Desc` (orig line 3605) was lost in
// extraction.
3606 if (!Desc.SubUnitsIdxBegin)
3607 continue;
3608 Masks[I] = 1ULL << ProcResourceID;
3609 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3610 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3611 ProcResourceID++;
3612 }
3613 LLVM_DEBUG({
3614 if (SwpShowResMask) {
3615 dbgs() << "ProcResourceDesc:\n";
3616 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3617 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3618 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3619 ProcResource->Name, I, Masks[I],
3620 ProcResource->NumUnits);
3621 }
3622 dbgs() << " -----------------\n";
3623 }
3624 });
3625 }
3626
// NOTE(review): the signature line (orig 3627) was lost in extraction;
// presumably bool ResourceManager::canReserveResources(SUnit &SU, int Cycle).
3628 LLVM_DEBUG({
3629 if (SwpDebugResource)
3630 dbgs() << "canReserveResources:\n";
3631 });
// DFA mode: delegate to the packetizer state for this modulo slot.
3632 if (UseDFA)
3633 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3634 ->canReserveResources(&SU.getInstr()->getDesc());
3635
3636 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3637 if (!SCDesc->isValid()) {
3638 LLVM_DEBUG({
3639 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3640 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3641 });
// No scheduling information: optimistically report the slot as free.
3642 return true;
3643 }
3644
// Trial reservation: book the resources, test for overbooking, then undo.
3645 reserveResources(SCDesc, Cycle);
3646 bool Result = !isOverbooked();
3647 unreserveResources(SCDesc, Cycle);
3648
3649 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
3650 return Result;
3651 }
3652
3653void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3654 LLVM_DEBUG({
3655 if (SwpDebugResource)
3656 dbgs() << "reserveResources:\n";
3657 });
3658 if (UseDFA)
3659 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3660 ->reserveResources(&SU.getInstr()->getDesc());
3661
3662 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3663 if (!SCDesc->isValid()) {
3664 LLVM_DEBUG({
3665 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3666 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3667 });
3668 return;
3669 }
3670
3671 reserveResources(SCDesc, Cycle);
3672
3673 LLVM_DEBUG({
3674 if (SwpDebugResource) {
3675 dumpMRT();
3676 dbgs() << "reserveResources: done!\n\n";
3677 }
3678 });
3679}
3680
3681void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3682 int Cycle) {
3683 assert(!UseDFA);
3684 for (const MCWriteProcResEntry &PRE : make_range(
3685 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3686 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3687 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3688
3689 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3690 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3691}
3692
3693void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3694 int Cycle) {
3695 assert(!UseDFA);
3696 for (const MCWriteProcResEntry &PRE : make_range(
3697 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3698 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3699 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3700
3701 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3702 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3703}
3704
3705 bool ResourceManager::isOverbooked() const {
3706 assert(!UseDFA);
// Check every modulo slot against per-resource unit counts and issue width.
3707 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3708 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
// NOTE(review): the declaration of `Desc` (orig line 3709, presumably from
// SM.getProcResource(I)) was lost in extraction.
3710 if (MRT[Slot][I] > Desc->NumUnits)
3711 return true;
3712 }
3713 if (NumScheduledMops[Slot] > IssueWidth)
3714 return true;
3715 }
3716 return false;
3717 }
3718
3719 int ResourceManager::calculateResMIIDFA() const {
3720 assert(UseDFA);
3721
3722 // Sort the instructions by the number of available choices for scheduling,
3723 // least to most. Use the number of critical resources as the tie breaker.
3724 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3725 for (SUnit &SU : DAG->SUnits)
3726 FUS.calcCriticalResources(*SU.getInstr());
// NOTE(review): the priority_queue declaration (orig line 3727) was lost in
// extraction; FuncUnitOrder below is that queue, ordered by FUS.
3728 FuncUnitOrder(FUS);
3729
3730 for (SUnit &SU : DAG->SUnits)
3731 FuncUnitOrder.push(SU.getInstr());
3732
// NOTE(review): the declaration of `Resources` (orig line 3733) was lost in
// extraction; it holds one unique_ptr<DFAPacketizer> per cycle of the MII.
3734 Resources.push_back(
3735 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3736
3737 while (!FuncUnitOrder.empty()) {
3738 MachineInstr *MI = FuncUnitOrder.top();
3739 FuncUnitOrder.pop();
3740 if (TII->isZeroCost(MI->getOpcode()))
3741 continue;
3742
3743 // Attempt to reserve the instruction in an existing DFA. At least one
3744 // DFA is needed for each cycle.
3745 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3746 unsigned ReservedCycles = 0;
3747 auto *RI = Resources.begin();
3748 auto *RE = Resources.end();
3749 LLVM_DEBUG({
3750 dbgs() << "Trying to reserve resource for " << NumCycles
3751 << " cycles for \n";
3752 MI->dump();
3753 });
// NOTE(review): RI deliberately carries over across cycles, so each DFA is
// tried at most once per instruction -- confirm against the original file.
3754 for (unsigned C = 0; C < NumCycles; ++C)
3755 while (RI != RE) {
3756 if ((*RI)->canReserveResources(*MI)) {
3757 (*RI)->reserveResources(*MI);
3758 ++ReservedCycles;
3759 break;
3760 }
3761 RI++;
3762 }
3763 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3764 << ", NumCycles:" << NumCycles << "\n");
3765 // Add new DFAs, if needed, to reserve resources.
3766 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
// NOTE(review): an LLVM_DEBUG(dbgs() prefix line (orig 3767) was lost in
// extraction; the two stream lines below belong to it.
3768 << "NewResource created to reserve resources"
3769 << "\n");
3770 auto *NewResource = TII->CreateTargetScheduleState(*ST);
3771 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3772 NewResource->reserveResources(*MI);
3773 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3774 }
3775 }
3776
// The resource MII is the number of DFAs (cycles) that were needed.
3777 int Resmii = Resources.size();
3778 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3779 return Resmii;
3780 }
3781
// NOTE(review): the signature line (orig 3782) was lost in extraction;
// presumably int ResourceManager::calculateResMII() const.
3783 if (UseDFA)
3784 return calculateResMIIDFA();
3785
3786 // Count each resource consumption and divide it by the number of units.
3787 // ResMII is the max value among them.
3788
3789 int NumMops = 0;
// NOTE(review): the declaration of `ResourceCount` (orig line 3790) was lost
// in extraction; it is indexed by ProcResourceIdx below.
3791 for (SUnit &SU : DAG->SUnits) {
3792 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3793 continue;
3794
3795 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3796 if (!SCDesc->isValid())
3797 continue;
3798
3799 LLVM_DEBUG({
3800 if (SwpDebugResource) {
3801 DAG->dumpNode(SU);
3802 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
3803 << " WriteProcRes: ";
3804 }
3805 });
3806 NumMops += SCDesc->NumMicroOps;
3807 for (const MCWriteProcResEntry &PRE :
3808 make_range(STI->getWriteProcResBegin(SCDesc),
3809 STI->getWriteProcResEnd(SCDesc))) {
3810 LLVM_DEBUG({
3811 if (SwpDebugResource) {
3812 const MCProcResourceDesc *Desc =
3813 SM.getProcResource(PRE.ProcResourceIdx);
3814 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
3815 }
3816 });
3817 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
3818 }
3819 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3820 }
3821
// The issue width bounds ResMII from below (ceiling division).
3822 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3823 LLVM_DEBUG({
3824 if (SwpDebugResource)
3825 dbgs() << "#Mops: " << NumMops << ", "
3826 << "IssueWidth: " << IssueWidth << ", "
3827 << "Cycles: " << Result << "\n";
3828 });
3829
3830 LLVM_DEBUG({
3831 if (SwpDebugResource) {
3832 std::stringstream SS;
3833 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3834 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3835 << "\n";
3836 dbgs() << SS.str();
3837 }
3838 });
// Take the maximum over every resource's required cycle count.
3839 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
// NOTE(review): the declaration of `Desc` (orig line 3840, presumably from
// SM.getProcResource(I)) was lost in extraction.
3841 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3842 LLVM_DEBUG({
3843 if (SwpDebugResource) {
3844 std::stringstream SS;
3845 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3846 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3847 << std::setw(10) << Cycles << "\n";
3848 dbgs() << SS.str();
3849 }
3850 });
3851 if (Cycles > Result)
3852 Result = Cycles;
3853 }
3854 return Result;
3855 }
3856
// NOTE(review): the signature line (orig 3857) was lost in extraction; this
// re-initializes all per-II resource-tracking state.
3858 InitiationInterval = II;
3859 DFAResources.clear();
3860 DFAResources.resize(II);
// One fresh DFA packetizer state per cycle of the initiation interval.
3861 for (auto &I : DFAResources)
3862 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3863 MRT.clear();
// NOTE(review): the MRT resize line (orig 3864) was lost in extraction.
3865 NumScheduledMops.clear();
3866 NumScheduledMops.resize(II);
3867 }
3868
3869bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
3870 if (Pred.isArtificial() || Dst->isBoundaryNode())
3871 return true;
3872 // Currently, dependence that is an anti-dependences but not a loop-carried is
3873 // also ignored. This behavior is preserved to prevent regression.
3874 // FIXME: Remove if this doesn't have significant impact on performance
3875 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
3876}
3877
3878SwingSchedulerDDG::SwingSchedulerDDGEdges &
3879SwingSchedulerDDG::getEdges(const SUnit *SU) {
3880 if (SU == EntrySU)
3881 return EntrySUEdges;
3882 if (SU == ExitSU)
3883 return ExitSUEdges;
3884 return EdgesVec[SU->NodeNum];
3885}
3886
3887const SwingSchedulerDDG::SwingSchedulerDDGEdges &
3888SwingSchedulerDDG::getEdges(const SUnit *SU) const {
3889 if (SU == EntrySU)
3890 return EntrySUEdges;
3891 if (SU == ExitSU)
3892 return ExitSUEdges;
3893 return EdgesVec[SU->NodeNum];
3894}
3895
3896void SwingSchedulerDDG::addEdge(const SUnit *SU,
3897 const SwingSchedulerDDGEdge &Edge) {
3898 auto &Edges = getEdges(SU);
3899 if (Edge.getSrc() == SU)
3900 Edges.Succs.push_back(Edge);
3901 else
3902 Edges.Preds.push_back(Edge);
3903}
3904
3905void SwingSchedulerDDG::initEdges(SUnit *SU) {
3906 for (const auto &PI : SU->Preds) {
3907 SwingSchedulerDDGEdge Edge(SU, PI, false);
3908 addEdge(SU, Edge);
3909 }
3910
3911 for (const auto &SI : SU->Succs) {
3912 SwingSchedulerDDGEdge Edge(SU, SI, true);
3913 addEdge(SU, Edge);
3914 }
3915}
3916
3917SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
3918 SUnit *ExitSU)
3919 : EntrySU(EntrySU), ExitSU(ExitSU) {
3920 EdgesVec.resize(SUnits.size());
3921
3922 initEdges(EntrySU);
3923 initEdges(ExitSU);
3924 for (auto &SU : SUnits)
3925 initEdges(&SU);
3926}
3927
// NOTE(review): the getInEdges signature lines (orig 3928-3929) were lost in
// extraction; this returns the stored predecessor-edge list for SU.
3930 return getEdges(SU).Preds;
3931 }
3932
// NOTE(review): the getOutEdges signature lines (orig 3933-3934) were lost in
// extraction; this returns the stored successor-edge list for SU.
3935 return getEdges(SU).Succs;
3936 }
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
bool End
Definition: ELF_riscv.cpp:480
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value)
When Op is a value that is incremented recursively in a loop and there is a unique instruction that i...
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static unsigned getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes from the loop block.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI)
unsigned const TargetRegisterInfo * TRI
unsigned Reg
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
Target-Independent Code Generator Pass Configuration Options pass.
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:240
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
A possibly irreducible generalization of a Loop.
bool getIncrementValue(const MachineInstr &MI, int &Value) const override
If the instruction is an increment of a constant value, return the amount.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:426
bool hasValue() const
TypeSize getValue() const
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:1073
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1432
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1440
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:895
A single uniqued string.
Definition: Metadata.h:724
StringRef getString() const
Definition: Metadata.cpp:616
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:71
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:577
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:349
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:574
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:808
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:938
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:764
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
void clear()
Definition: MapVector.h:88
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
void dump() const
Definition: Pass.cpp:136
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
void runOnMachineFunction(const MachineFunction &MF)
runOnFunction - Prepare to answer questions about MF.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:121
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the earliest scheduled instruction in the dependence chain.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the latest scheduled instruction in the dependence chain.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:272
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:378
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:270
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:382
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:449
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:303
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:358
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:292
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:263
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:262
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
TargetSchedModel SchedModel
TargetSchedModel provides an interface to the machine model.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N from ...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:578
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:575
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:579
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:576
SUnit EntrySU
Special node for the region entry.
Definition: ScheduleDAG.h:580
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
SUnit ExitSU
Special node for the region exit.
Definition: ScheduleDAG.h:581
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const
Return true for an order or output dependence that is loop carried potentially.
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
bool mayOverlapInLaterIter(const MachineInstr *BaseMI, const MachineInstr *OtherMI) const
Return false if there is no overlap between the region accessed by BaseMI in an iteration and the reg...
int getMOV(SUnit *Node)
The mobility function, which is the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction may be scheduled.
Represents a dependence between two instruction.
SUnit * getDst() const
Returns the SUnit to which the edge points (destination node).
bool isArtificial() const
Returns true if the edge represents an artificial dependence.
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG nodes that we ignore when computing the cost functions.
bool isOrderDep() const
Returns true if the edge represents a dependence that is not data, anti or output dependence.
SUnit * getSrc() const
Returns the SUnit from which the edge comes (source node).
bool isOutputDep() const
Returns true if the edge represents output dependence.
Represents dependencies between instructions.
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU)
const EdgesType & getInEdges(const SUnit *SU) const
const EdgesType & getOutEdges(const SUnit *SU) const
Object returned by analyzeLoopForPipelining.
virtual bool isMVEExpanderSupported()
Return true if the target can expand pipelined schedule with modulo variable expansion.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should be used.
TargetInstrInfo - Interface to description of machine instruction set.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:81
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
virtual const TargetInstrInfo * getInstrInfo() const
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
LLVM Value Representation.
Definition: Value.h:74
The main class in the implementation of the target independent window scheduler.
unsigned getPosition() const
Definition: CommandLine.h:306
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double e
Definition: MathExtras.h:47
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:480
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
@ WS_Force
Use window algorithm instead of SMS algorithm.
@ WS_On
Use window algorithm after SMS algorithm fails.
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1978
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
#define N
Description of the encoding of one expression Op.
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:34
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:121
bool isValid() const
Definition: MCSchedule.h:139
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:256
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:363
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:352
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:337
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:356
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:66
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
const MachineDominatorTree * MDT
RegisterClassInfo * RegClassInfo
const MachineLoopInfo * MLI
const TargetMachine * TM
Store the effects of a change in pressure on things that MI scheduler cares about.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.