LLVM 22.0.0git
GCNSchedStrategy.cpp
1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17///
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
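//
// Editorial sketch (not part of the original file): the stage driver in
// GCNScheduleDAGMILive::runSchedStages() below conceptually runs each stage
// created by createSchedStage() over every recorded region, roughly:
//
//   while (S.advanceStage()) {                 // OccInitialSchedule,
//     auto Stage = createSchedStage(S.getCurrentStage());
//     if (!Stage->initGCNSchedStage())         // UnclusteredHighRPReschedule,
//       continue;                              // ClusteredLowOccupancyReschedule,
//     for (auto Region : Regions) {            // PreRARematerialize, or an
//       if (!Stage->initGCNRegion()) {         // ILP/MemoryClause initial stage
//         Stage->advanceRegion();
//         exitRegion();
//         continue;
//       }
//       // ...schedule the region...
//       Stage->finalizeGCNRegion();
//     }
//     Stage->finalizeGCNSchedStage();
//   }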
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNRegPressure.h"
31#include "llvm/ADT/STLExtras.h"
33#include "llvm/MC/LaneBitmask.h"
35
36#define DEBUG_TYPE "machine-scheduler"
37
38using namespace llvm;
39
41 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
42 cl::desc("Disable unclustered high register pressure "
43 "reduction scheduling stage."),
44 cl::init(false));
45
47 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
48 cl::desc("Disable clustered low occupancy "
49 "rescheduling for ILP scheduling stage."),
50 cl::init(false));
51
53 "amdgpu-schedule-metric-bias", cl::Hidden,
55 "Sets the bias which adds weight to occupancy vs latency. Set it to "
56 "100 to chase the occupancy only."),
57 cl::init(10));
58
59static cl::opt<bool>
60 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
61 cl::desc("Relax occupancy targets for kernels which are memory "
62 "bound (amdgpu-membound-threshold), or "
63 "Wave Limited (amdgpu-limit-wave-threshold)."),
64 cl::init(false));
65
67 "amdgpu-use-amdgpu-trackers", cl::Hidden,
68 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
69 cl::init(false));
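
// Editorial usage sketch (assumption, not from this file): being cl::opt
// flags, these knobs are normally passed straight to llc, or via -mllvm when
// the backend is driven from clang, for example:
//
//   llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-schedule-metric-bias=50 \
//       -amdgpu-use-amdgpu-trackers in.ll -o out.s
//
//   clang ... -mllvm -amdgpu-disable-unclustered-high-rp-reschedule ...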
70
71const unsigned ScheduleMetrics::ScaleFactor = 100;
72
74 : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
75 DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
76}
77
80
81 MF = &DAG->MF;
82
84
86 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
88 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
89
91 // Set the initial TargetOccupancy to the maximum occupancy that we can
92 // achieve for this function. This effectively sets a lower bound on the
93 // 'Critical' register limits in the scheduler.
94 // Allow for lower occupancy targets if kernel is wave limited or memory
95 // bound, and using the relaxed occupancy feature.
99 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
100
101 if (!KnownExcessRP) {
102 VGPRCriticalLimit = std::min(
103 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
105 } else {
106 // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
107 // that it returns a reasonably small number for targets with lots of VGPRs,
108 // such as GFX10 and GFX11.
109 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
110 "VGPRCriticalLimit calculation method.\n");
111 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
112 unsigned Granule =
113 AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
114 unsigned Addressable =
115 AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
116 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
117 VGPRBudget = std::max(VGPRBudget, Granule);
118 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
119 }
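 // Worked example for the alternative limit above (illustrative numbers
 // only): with Addressable = 256 VGPRs, TargetOccupancy = 6 waves, and
 // Granule = 8, VGPRBudget = alignDown(256 / 6, 8) = alignDown(42, 8) = 40,
 // which is then clamped below by Granule and above by VGPRExcessLimit.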
120
121 // Subtract error margin and bias from register limits and avoid overflow.
126
127 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
128 << ", VGPRExcessLimit = " << VGPRExcessLimit
129 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
130 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
131}
132
133/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
134/// current register pressure.
135///
136/// This works for the common case, but it has a few exceptions that have been
137/// observed through trial and error:
138/// - Explicit physical register operands
139/// - Subregister definitions
140///
141/// In both of those cases, PressureDiff doesn't represent the actual pressure,
142/// and querying LiveIntervals through the RegPressureTracker is needed to get
143/// an accurate value.
144///
145/// We should eventually only use PressureDiff for maximum performance, but this
146/// already allows 80% of SUs to take the fast path without changing scheduling
147/// at all. Further changes would either change scheduling, or require a lot
148/// more logic to recover an accurate pressure estimate from the PressureDiffs.
149static bool canUsePressureDiffs(const SUnit &SU) {
150 if (!SU.isInstr())
151 return false;
152
153 // Cannot use pressure diffs for subregister defs or with physregs; it's
154 // imprecise in both cases.
155 for (const auto &Op : SU.getInstr()->operands()) {
156 if (!Op.isReg() || Op.isImplicit())
157 continue;
158 if (Op.getReg().isPhysical() ||
159 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
160 return false;
161 }
162 return true;
163}
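// Editorial, MIR-style illustration (hypothetical operands, not from this
// file) of instructions that fail the check above and therefore fall back to
// the RegPressureTracker slow path:
//   $vgpr0 = COPY %0                  ; explicit physical register operand
//   undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
//                                     ; subregister definition (the implicit
//                                     ; $exec operand is skipped by the loop)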
164
166 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
167 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
168 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
169 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
170 // getDownwardPressure() and getUpwardPressure() make temporary changes to
171 // the tracker, so we need to pass those functions a non-const reference.
172 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
173 if (!GCNTrackers) {
174 AtTop
175 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
176 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
177
178 return;
179 }
180
181 // GCNTrackers
182 Pressure.resize(4, 0);
183 MachineInstr *MI = SU->getInstr();
184 GCNRegPressure NewPressure;
185 if (AtTop) {
186 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
187 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
188 } else {
189 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
190 TempUpwardTracker.recede(*MI);
191 NewPressure = TempUpwardTracker.getPressure();
192 }
193 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
194 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
195 NewPressure.getArchVGPRNum();
196 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
197}
198
200 bool AtTop,
201 const RegPressureTracker &RPTracker,
202 const SIRegisterInfo *SRI,
203 unsigned SGPRPressure,
204 unsigned VGPRPressure, bool IsBottomUp) {
205 Cand.SU = SU;
206 Cand.AtTop = AtTop;
207
208 if (!DAG->isTrackingPressure())
209 return;
210
211 Pressure.clear();
212 MaxPressure.clear();
213
214 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
215 // possible over querying the RegPressureTracker.
216 //
217 // RegPressureTracker will make a lot of LIS queries, which are very
218 // expensive; it is considered a slow function in this context.
219 //
220 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
221 // trivial lookup into an array. It is pretty much free.
222 //
223 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
224 // PressureDiffs.
225 if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
226 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
228 } else {
229 // Reserve 4 slots.
230 Pressure.resize(4, 0);
231 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
232 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
233
234 for (const auto &Diff : DAG->getPressureDiff(SU)) {
235 if (!Diff.isValid())
236 continue;
237 // PressureDiffs are always bottom-up, so if we're working top-down we need
238 // to invert their sign.
239 Pressure[Diff.getPSet()] +=
240 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
241 }
242
243#ifdef EXPENSIVE_CHECKS
244 std::vector<unsigned> CheckPressure, CheckMaxPressure;
245 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
247 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
248 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
249 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
250 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
251 errs() << "Register Pressure is inaccurate when calculated through "
252 "PressureDiff\n"
253 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
254 << ", expected "
255 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
256 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
257 << ", expected "
258 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
259 report_fatal_error("inaccurate register pressure calculation");
260 }
261#endif
262 }
263
264 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
265 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
266
267 // If two instructions increase the pressure of different register sets
268 // by the same amount, the generic scheduler will prefer to schedule the
269 // instruction that increases the set with the least amount of registers,
270 // which in our case would be SGPRs. This is rarely what we want, so
271 // when we report excess/critical register pressure, we do it either
272 // only for VGPRs or only for SGPRs.
273
274 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
275 const unsigned MaxVGPRPressureInc = 16;
276 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
277 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
278
279 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
280 // to increase the likelihood we don't go over the limits. We should improve
281 // the analysis to look through dependencies to find the path with the least
282 // register pressure.
283
284 // We only need to update the RPDelta for instructions that increase register
285 // pressure. Instructions that decrease or keep reg pressure the same will be
286 // marked as RegExcess in tryCandidate() when they are compared with
287 // instructions that increase the register pressure.
288 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
289 HasHighPressure = true;
290 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
291 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
292 }
293
294 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
295 HasHighPressure = true;
296 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
297 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
298 }
299
300 // Register pressure is considered 'CRITICAL' if it is approaching a value
301 // that would reduce the wave occupancy for the execution unit. When
302 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure have
303 // the same cost, so we don't need to prefer one over the other.
304
305 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
306 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
307
308 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
309 HasHighPressure = true;
310 if (SGPRDelta > VGPRDelta) {
311 Cand.RPDelta.CriticalMax =
312 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
313 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
314 } else {
315 Cand.RPDelta.CriticalMax =
316 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
317 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
318 }
319 }
320}
321
322// This function is mostly cut and pasted from
323// GenericScheduler::pickNodeFromQueue()
325 const CandPolicy &ZonePolicy,
326 const RegPressureTracker &RPTracker,
327 SchedCandidate &Cand,
328 bool IsBottomUp) {
329 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
331 unsigned SGPRPressure = 0;
332 unsigned VGPRPressure = 0;
333 if (DAG->isTrackingPressure()) {
334 if (!GCNTrackers) {
335 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
336 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
337 } else {
338 GCNRPTracker *T = IsBottomUp
339 ? static_cast<GCNRPTracker *>(&UpwardTracker)
340 : static_cast<GCNRPTracker *>(&DownwardTracker);
341 SGPRPressure = T->getPressure().getSGPRNum();
342 VGPRPressure = T->getPressure().getArchVGPRNum();
343 }
344 }
345 ReadyQueue &Q = Zone.Available;
346 for (SUnit *SU : Q) {
347
348 SchedCandidate TryCand(ZonePolicy);
349 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
350 VGPRPressure, IsBottomUp);
351 // Pass SchedBoundary only when comparing nodes from the same boundary.
352 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
353 tryCandidate(Cand, TryCand, ZoneArg);
354 if (TryCand.Reason != NoCand) {
355 // Initialize resource delta if needed in case future heuristics query it.
356 if (TryCand.ResDelta == SchedResourceDelta())
357 TryCand.initResourceDelta(Zone.DAG, SchedModel);
358 Cand.setBest(TryCand);
360 }
361 }
362}
363
364// This function is mostly cut and pasted from
365// GenericScheduler::pickNodeBidirectional()
367 // Schedule as far as possible in the direction of no choice. This is most
368 // efficient, but also provides the best heuristics for CriticalPSets.
369 if (SUnit *SU = Bot.pickOnlyChoice()) {
370 IsTopNode = false;
371 return SU;
372 }
373 if (SUnit *SU = Top.pickOnlyChoice()) {
374 IsTopNode = true;
375 return SU;
376 }
377 // Set the bottom-up policy based on the state of the current bottom zone and
378 // the instructions outside the zone, including the top zone.
379 CandPolicy BotPolicy;
380 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
381 // Set the top-down policy based on the state of the current top zone and
382 // the instructions outside the zone, including the bottom zone.
383 CandPolicy TopPolicy;
384 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
385
386 // See if BotCand is still valid (because we previously scheduled from Top).
387 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
388 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
389 BotCand.Policy != BotPolicy) {
392 /*IsBottomUp=*/true);
393 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
394 } else {
396#ifndef NDEBUG
397 if (VerifyScheduling) {
398 SchedCandidate TCand;
399 TCand.reset(CandPolicy());
400 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
401 /*IsBottomUp=*/true);
402 assert(TCand.SU == BotCand.SU &&
403 "Last pick result should correspond to re-picking right now");
404 }
405#endif
406 }
407
408 // Check if the top Q has a better candidate.
409 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
410 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
411 TopCand.Policy != TopPolicy) {
414 /*IsBottomUp=*/false);
415 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
416 } else {
418#ifndef NDEBUG
419 if (VerifyScheduling) {
420 SchedCandidate TCand;
421 TCand.reset(CandPolicy());
422 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
423 /*IsBottomUp=*/false);
424 assert(TCand.SU == TopCand.SU &&
425 "Last pick result should correspond to re-picking right now");
426 }
427#endif
428 }
429
430 // Pick best from BotCand and TopCand.
431 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
432 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
433 SchedCandidate Cand = BotCand;
435 tryCandidate(Cand, TopCand, nullptr);
436 if (TopCand.Reason != NoCand) {
437 Cand.setBest(TopCand);
438 }
439 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
440
441 IsTopNode = Cand.AtTop;
442 return Cand.SU;
443}
444
445// This function is mostly cut and pasted from
446// GenericScheduler::pickNode()
448 if (DAG->top() == DAG->bottom()) {
450 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
451 return nullptr;
452 }
453 SUnit *SU;
454 do {
456 SU = Top.pickOnlyChoice();
457 if (!SU) {
458 CandPolicy NoPolicy;
459 TopCand.reset(NoPolicy);
461 /*IsBottomUp=*/false);
462 assert(TopCand.Reason != NoCand && "failed to find a candidate");
463 SU = TopCand.SU;
464 }
465 IsTopNode = true;
466 } else if (RegionPolicy.OnlyBottomUp) {
467 SU = Bot.pickOnlyChoice();
468 if (!SU) {
469 CandPolicy NoPolicy;
470 BotCand.reset(NoPolicy);
472 /*IsBottomUp=*/true);
473 assert(BotCand.Reason != NoCand && "failed to find a candidate");
474 SU = BotCand.SU;
475 }
476 IsTopNode = false;
477 } else {
478 SU = pickNodeBidirectional(IsTopNode);
479 }
480 } while (SU->isScheduled);
481
482 if (SU->isTopReady())
483 Top.removeReady(SU);
484 if (SU->isBottomReady())
485 Bot.removeReady(SU);
486
487 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
488 << *SU->getInstr());
489 return SU;
490}
491
492void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
493 if (GCNTrackers) {
494 MachineInstr *MI = SU->getInstr();
495 IsTopNode ? (void)DownwardTracker.advance(MI, false)
497 }
498
499 return GenericScheduler::schedNode(SU, IsTopNode);
500}
501
504 return *CurrentStage;
505}
506
509 if (!CurrentStage)
511 else
512 CurrentStage++;
513
514 return CurrentStage != SchedStages.end();
515}
516
519 return std::next(CurrentStage) != SchedStages.end();
520}
521
523 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
524 return *std::next(CurrentStage);
525}
526
528 const MachineSchedContext *C, bool IsLegacyScheduler)
534 GCNTrackers = GCNTrackers & !IsLegacyScheduler;
535}
536
540}
541
543 SchedCandidate &TryCand,
544 SchedBoundary *Zone) const {
545 // Initialize the candidate if needed.
546 if (!Cand.isValid()) {
547 TryCand.Reason = NodeOrder;
548 return true;
549 }
550
551 // Avoid spilling by exceeding the register limit.
552 if (DAG->isTrackingPressure() &&
553 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
554 RegExcess, TRI, DAG->MF))
555 return TryCand.Reason != NoCand;
556
557 // Bias PhysReg defs and copies toward their uses and defs, respectively.
558 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
559 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
560 return TryCand.Reason != NoCand;
561
562 bool SameBoundary = Zone != nullptr;
563 if (SameBoundary) {
564 // Prioritize instructions that read unbuffered resources by stall cycles.
565 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
566 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
567 return TryCand.Reason != NoCand;
568
569 // Avoid critical resource consumption and balance the schedule.
572 TryCand, Cand, ResourceReduce))
573 return TryCand.Reason != NoCand;
575 Cand.ResDelta.DemandedResources, TryCand, Cand,
577 return TryCand.Reason != NoCand;
578
579 // Unconditionally try to reduce latency.
580 if (tryLatency(TryCand, Cand, *Zone))
581 return TryCand.Reason != NoCand;
582
583 // Weak edges are for clustering and other constraints.
584 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
585 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
586 return TryCand.Reason != NoCand;
587 }
588
589 // Keep clustered nodes together to encourage downstream peephole
590 // optimizations which may reduce resource requirements.
591 //
592 // This is a best effort to set things up for a post-RA pass. Optimizations
593 // like generating loads of multiple registers should ideally be done within
594 // the scheduler pass by combining the loads during DAG postprocessing.
595 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
596 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
597 bool CandIsClusterSucc =
598 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
599 bool TryCandIsClusterSucc =
600 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
601 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
602 Cluster))
603 return TryCand.Reason != NoCand;
604
605 // Avoid increasing the max critical pressure in the scheduled region.
606 if (DAG->isTrackingPressure() &&
608 TryCand, Cand, RegCritical, TRI, DAG->MF))
609 return TryCand.Reason != NoCand;
610
611 // Avoid increasing the max pressure of the entire region.
612 if (DAG->isTrackingPressure() &&
613 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
614 Cand, RegMax, TRI, DAG->MF))
615 return TryCand.Reason != NoCand;
616
617 if (SameBoundary) {
618 // Fall through to original instruction order.
619 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
620 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
621 TryCand.Reason = NodeOrder;
622 return true;
623 }
624 }
625 return false;
626}
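// Editorial summary of the priority order applied above: register excess ->
// physreg bias -> (same boundary only: stall cycles, critical resources,
// demanded resources, latency, weak edges) -> clustering -> critical-set
// pressure -> maximum region pressure -> original node order.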
627
629 const MachineSchedContext *C)
632}
633
634/// GCNMaxMemoryClauseSchedStrategy tries to clause memory instructions as
635/// much as possible. This is achieved by:
636/// 1. Prioritizing clustered operations before the stall latency heuristic.
637/// 2. Prioritizing long-latency loads before the stall latency heuristic.
638///
639/// \param Cand provides the policy and current best candidate.
640/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
641/// \param Zone describes the scheduled zone that we are extending, or nullptr
642/// if Cand is from a different zone than TryCand.
643/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
645 SchedCandidate &TryCand,
646 SchedBoundary *Zone) const {
647 // Initialize the candidate if needed.
648 if (!Cand.isValid()) {
649 TryCand.Reason = NodeOrder;
650 return true;
651 }
652
653 // Bias PhysReg defs and copies toward their uses and defs, respectively.
654 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
655 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
656 return TryCand.Reason != NoCand;
657
658 if (DAG->isTrackingPressure()) {
659 // Avoid exceeding the target's limit.
660 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
661 RegExcess, TRI, DAG->MF))
662 return TryCand.Reason != NoCand;
663
664 // Avoid increasing the max critical pressure in the scheduled region.
666 TryCand, Cand, RegCritical, TRI, DAG->MF))
667 return TryCand.Reason != NoCand;
668 }
669
670 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
671 // get more benefit from clausing these memory instructions.
672 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
673 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
674 bool CandIsClusterSucc =
675 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
676 bool TryCandIsClusterSucc =
677 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
678 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
679 Cluster))
680 return TryCand.Reason != NoCand;
681
682 // We only compare a subset of features when comparing nodes between the
683 // Top and Bottom boundaries. Some properties are simply incomparable; in
684 // many other instances we should only override the other boundary if
685 // something is a clearly good pick on one boundary. Skip heuristics that
686 // are more "tie-breaking" in nature.
687 bool SameBoundary = Zone != nullptr;
688 if (SameBoundary) {
689 // For loops that are acyclic path limited, aggressively schedule for
690 // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
691 // heuristics to take precedence.
692 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
693 tryLatency(TryCand, Cand, *Zone))
694 return TryCand.Reason != NoCand;
695
696 // MaxMemoryClause-specific: Prioritize long latency memory load
697 // instructions in top-bottom order to hide more latency. The mayLoad check
698 // is used to exclude store-like instructions, which we do not want to
699 // schedule too early.
700 bool TryMayLoad =
701 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
702 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
703
704 if (TryMayLoad || CandMayLoad) {
705 bool TryLongLatency =
706 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
707 bool CandLongLatency =
708 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
709
710 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
711 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
712 Cand, Stall))
713 return TryCand.Reason != NoCand;
714 }
715 // Prioritize instructions that read unbuffered resources by stall cycles.
716 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
717 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
718 return TryCand.Reason != NoCand;
719 }
720
721 if (SameBoundary) {
722 // Weak edges are for clustering and other constraints.
723 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
724 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
725 return TryCand.Reason != NoCand;
726 }
727
728 // Avoid increasing the max pressure of the entire region.
729 if (DAG->isTrackingPressure() &&
730 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
731 Cand, RegMax, TRI, DAG->MF))
732 return TryCand.Reason != NoCand;
733
734 if (SameBoundary) {
735 // Avoid critical resource consumption and balance the schedule.
738 TryCand, Cand, ResourceReduce))
739 return TryCand.Reason != NoCand;
741 Cand.ResDelta.DemandedResources, TryCand, Cand,
743 return TryCand.Reason != NoCand;
744
745 // Avoid serializing long latency dependence chains.
746 // For acyclic path limited loops, latency was already checked above.
748 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
749 return TryCand.Reason != NoCand;
750
751 // Fall through to original instruction order.
752 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
753 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
754 TryCand.Reason = NodeOrder;
755 return true;
756 }
757 }
758
759 return false;
760}
761
763 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
764 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
765 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
766 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
767 RegionLiveOuts(this, /*IsLiveOut=*/true) {
768
769 // We want regions with a single MI to be scheduled so that we can reason
770 // about them correctly during scheduling stages that move MIs between regions
771 // (e.g., rematerialization).
773 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
774 if (RelaxedOcc) {
775 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
776 if (MinOccupancy != StartingOccupancy)
777 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
778 << ".\n");
779 }
780}
781
782std::unique_ptr<GCNSchedStage>
783GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
784 switch (SchedStageID) {
786 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
788 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
790 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
792 return std::make_unique<PreRARematStage>(SchedStageID, *this);
794 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
796 return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
797 *this);
798 }
799
800 llvm_unreachable("Unknown SchedStageID.");
801}
802
804 // Collect all scheduling regions. The actual scheduling is performed in
805 // GCNScheduleDAGMILive::finalizeSchedule.
806 Regions.push_back(std::pair(RegionBegin, RegionEnd));
807}
808
810GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
812 RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
813 &LiveIns[RegionIdx]);
814 return RPTracker.moveMaxPressure();
815}
816
818 MachineBasicBlock::iterator RegionEnd) {
819 auto REnd = RegionEnd == RegionBegin->getParent()->end()
820 ? std::prev(RegionEnd)
821 : RegionEnd;
822 return &*skipDebugInstructionsBackward(REnd, RegionBegin);
823}
824
825void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
826 const MachineBasicBlock *MBB) {
828
829 // If the block has only one successor then the live-ins of that successor are
830 // live-outs of the current block. We can reuse the calculated live set if the
831 // successor will be sent to scheduling past the current block.
832
833 // However, due to a bug in LiveInterval analysis it may happen that two
834 // predecessors of the same successor block have different lane bitmasks for
835 // a live-out register. Work around that by sticking to a one-to-one
836 // relationship, i.e. one predecessor with one successor block.
837 const MachineBasicBlock *OnlySucc = nullptr;
838 if (MBB->succ_size() == 1) {
839 auto *Candidate = *MBB->succ_begin();
840 if (!Candidate->empty() && Candidate->pred_size() == 1) {
842 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
843 OnlySucc = Candidate;
844 }
845 }
846
847 // Scheduler sends regions from the end of the block upwards.
848 size_t CurRegion = RegionIdx;
849 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
850 if (Regions[CurRegion].first->getParent() != MBB)
851 break;
852 --CurRegion;
853
854 auto I = MBB->begin();
855 auto LiveInIt = MBBLiveIns.find(MBB);
856 auto &Rgn = Regions[CurRegion];
857 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
858 if (LiveInIt != MBBLiveIns.end()) {
859 auto LiveIn = std::move(LiveInIt->second);
860 RPTracker.reset(*MBB->begin(), &LiveIn);
861 MBBLiveIns.erase(LiveInIt);
862 } else {
863 I = Rgn.first;
864 auto LRS = BBLiveInMap.lookup(NonDbgMI);
865#ifdef EXPENSIVE_CHECKS
866 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
867#endif
868 RPTracker.reset(*I, &LRS);
869 }
870
871 for (;;) {
872 I = RPTracker.getNext();
873
874 if (Regions[CurRegion].first == I || NonDbgMI == I) {
875 LiveIns[CurRegion] = RPTracker.getLiveRegs();
876 RPTracker.clearMaxPressure();
877 }
878
879 if (Regions[CurRegion].second == I) {
880 Pressure[CurRegion] = RPTracker.moveMaxPressure();
881 if (CurRegion-- == RegionIdx)
882 break;
883 auto &Rgn = Regions[CurRegion];
884 NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
885 }
886 RPTracker.advanceToNext();
887 RPTracker.advanceBeforeNext();
888 }
889
890 if (OnlySucc) {
891 if (I != MBB->end()) {
892 RPTracker.advanceToNext();
894 }
895 RPTracker.advanceBeforeNext();
896 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
897 }
898}
899
901GCNScheduleDAGMILive::getRegionLiveInMap() const {
902 assert(!Regions.empty());
903 std::vector<MachineInstr *> RegionFirstMIs;
904 RegionFirstMIs.reserve(Regions.size());
905 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
906 RegionFirstMIs.push_back(
908
909 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
910}
911
913GCNScheduleDAGMILive::getRegionLiveOutMap() const {
914 assert(!Regions.empty());
915 std::vector<MachineInstr *> RegionLastMIs;
916 RegionLastMIs.reserve(Regions.size());
917 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
918 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
919
920 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
921}
922
924 IdxToInstruction.clear();
925
926 RegionLiveRegMap =
927 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
928 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
929 MachineInstr *RegionKey =
930 IsLiveOut
931 ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
932 : &*DAG->Regions[I].first;
933 IdxToInstruction[I] = RegionKey;
934 }
935}
936
938 // Start actual scheduling here. This function is called by the base
939 // MachineScheduler after all regions have been recorded by
940 // GCNScheduleDAGMILive::schedule().
941 LiveIns.resize(Regions.size());
942 Pressure.resize(Regions.size());
943 RegionsWithHighRP.resize(Regions.size());
944 RegionsWithExcessRP.resize(Regions.size());
945 RegionsWithIGLPInstrs.resize(Regions.size());
946 RegionsWithHighRP.reset();
947 RegionsWithExcessRP.reset();
948 RegionsWithIGLPInstrs.reset();
949
950 runSchedStages();
951}
952
953void GCNScheduleDAGMILive::runSchedStages() {
954 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
955
956 if (!Regions.empty()) {
957 BBLiveInMap = getRegionLiveInMap();
958 if (GCNTrackers)
959 RegionLiveOuts.buildLiveRegMap();
960 }
961
962 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
963 while (S.advanceStage()) {
964 auto Stage = createSchedStage(S.getCurrentStage());
965 if (!Stage->initGCNSchedStage())
966 continue;
967
968 for (auto Region : Regions) {
969 RegionBegin = Region.first;
970 RegionEnd = Region.second;
971 // Setup for scheduling the region and check whether it should be skipped.
972 if (!Stage->initGCNRegion()) {
973 Stage->advanceRegion();
974 exitRegion();
975 continue;
976 }
977
978 if (GCNTrackers) {
979 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
980 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
981 GCNRPTracker::LiveRegSet *RegionLiveIns =
982 &LiveIns[Stage->getRegionIdx()];
983
984 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
985 ->reset(MRI, *RegionLiveIns);
986 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
987 ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
988 Stage->getRegionIdx()));
989 }
990
992 Stage->finalizeGCNRegion();
993 }
994
995 Stage->finalizeGCNSchedStage();
996 }
997}
998
999#ifndef NDEBUG
1001 switch (StageID) {
1003 OS << "Max Occupancy Initial Schedule";
1004 break;
1006 OS << "Unclustered High Register Pressure Reschedule";
1007 break;
1009 OS << "Clustered Low Occupancy Reschedule";
1010 break;
1012 OS << "Pre-RA Rematerialize";
1013 break;
1015 OS << "Max ILP Initial Schedule";
1016 break;
1018 OS << "Max memory clause Initial Schedule";
1019 break;
1020 }
1021
1022 return OS;
1023}
1024#endif
1025
1027 : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
1028 MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
1029
1031 if (!DAG.LIS)
1032 return false;
1033
1034 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1035 return true;
1036}
1037
1040 return false;
1041
1043 return false;
1044
1045 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1046 return false;
1047
1051
1052 InitialOccupancy = DAG.MinOccupancy;
1053 // Aggressively try to reduce register pressure in the unclustered high RP
1054 // stage. Temporarily increase occupancy target in the region.
1057 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1058 MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
1059
1060 LLVM_DEBUG(
1061 dbgs()
1062 << "Retrying function scheduling without clustering. "
1063 "Aggressivly try to reduce register pressure to achieve occupancy "
1064 << DAG.MinOccupancy << ".\n");
1065
1066 return true;
1067}
1068
1071 return false;
1072
1074 return false;
1075
1076 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1077 // been dropped. All regions will have already been scheduled with the ideal
1078 // occupancy targets.
1079 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1080 return false;
1081
1082 LLVM_DEBUG(
1083 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1084 << DAG.MinOccupancy << ".\n");
1085 return true;
1086}
1087
1088/// Allows to easily filter for this stage's debug output.
1089#define REMAT_PREFIX "[PreRARemat] "
1090#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1091
1093 // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
1094 // regions in between the defs and the region we sank the def to. Will need to
1095 // be fixed if there is another pass after this pass.
1096 assert(!S.hasNextStage());
1097
1098 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
1099 return false;
1100
1101 // Before performing any IR modification record the parent region of each MI
1102 // and the parent MBB of each region.
1103 const unsigned NumRegions = DAG.Regions.size();
1104 RegionBB.reserve(NumRegions);
1105 for (unsigned I = 0; I < NumRegions; ++I) {
1106 RegionBoundaries Region = DAG.Regions[I];
1107 for (auto MI = Region.first; MI != Region.second; ++MI)
1108 MIRegion.insert({&*MI, I});
1109 RegionBB.push_back(Region.first->getParent());
1110 }
1111
1112 if (!canIncreaseOccupancyOrReduceSpill())
1113 return false;
1114
1115 // Rematerialize identified instructions and update scheduler's state.
1116 rematerialize();
1117 if (GCNTrackers)
1118 DAG.RegionLiveOuts.buildLiveRegMap();
1119 REMAT_DEBUG({
1120 dbgs() << "Retrying function scheduling with new min. occupancy of "
1121 << AchievedOcc << " from rematerializing (original was "
1122 << DAG.MinOccupancy;
1123 if (TargetOcc)
1124 dbgs() << ", target was " << *TargetOcc;
1125 dbgs() << ")\n";
1126 });
1127
1128 if (AchievedOcc > DAG.MinOccupancy) {
1129 DAG.MinOccupancy = AchievedOcc;
1131 MFI.increaseOccupancy(MF, DAG.MinOccupancy);
1132 }
1133 return true;
1134}
1135
1137 DAG.finishBlock();
1138 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1139}
1140
1144 if (DAG.MinOccupancy > InitialOccupancy) {
1146 << " stage successfully increased occupancy to "
1147 << DAG.MinOccupancy << '\n');
1148 }
1149
1151}
1152
1154 // Check whether this new region is also a new block.
1155 if (DAG.RegionBegin->getParent() != CurrentMBB)
1156 setupNewBlock();
1157
1158 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1159 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1160
1161 // Skip empty scheduling regions (0 or 1 schedulable instructions).
1162 if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
1163 return false;
1164
1165 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1167 << " " << CurrentMBB->getName()
1168 << "\n From: " << *DAG.begin() << " To: ";
1170 else dbgs() << "End";
1171 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1172
1173 // Save original instruction order before scheduling for possible revert.
1174 Unsched.clear();
1175 Unsched.reserve(DAG.NumRegionInstrs);
1178 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1179 for (auto &I : DAG) {
1180 Unsched.push_back(&I);
1181 if (SII->isIGLPMutationOnly(I.getOpcode()))
1182 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1183 }
1184 } else {
1185 for (auto &I : DAG)
1186 Unsched.push_back(&I);
1187 }
1188
1189 PressureBefore = DAG.Pressure[RegionIdx];
1190
1191 LLVM_DEBUG(
1192 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1193 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1194 << "Region live-in pressure: "
1196 << "Region register pressure: " << print(PressureBefore));
1197
1198 S.HasHighPressure = false;
1200
1201 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1203 SavedMutations.clear();
1205 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1208 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1210 }
1211
1212 return true;
1213}
1214
1216 // Only reschedule regions that have excess register pressure (i.e. spilling)
1217 // or had minimum occupancy at the beginning of the stage (as long as
1218 // rescheduling of previous regions did not make occupancy drop back down to
1219 // the initial minimum).
1220 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1221 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1222 (DAG.MinOccupancy <= InitialOccupancy ||
1223 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1224 InitialOccupancy))
1225 return false;
1226
1228}
1229
1231 // We may need to reschedule this region if it wasn't rescheduled in the last
1232 // stage, or if we found it was testing critical register pressure limits in
1233 // the unclustered reschedule stage. The latter is because we may not have been
1234 // able to raise the min occupancy in the previous stage so the region may be
1235 // overly constrained even if it was already rescheduled.
1236 if (!DAG.RegionsWithHighRP[RegionIdx])
1237 return false;
1238
1240}
1241
1243 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1244}
1245
1247 if (CurrentMBB)
1248 DAG.finishBlock();
1249
1250 CurrentMBB = DAG.RegionBegin->getParent();
1252 // Get real RP for the region if it hasn't been calculated before. After the
1253 // initial schedule stage real RP will be collected after scheduling.
1257 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1258}
1259
1261 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1262 if (S.HasHighPressure)
1263 DAG.RegionsWithHighRP[RegionIdx] = true;
1264
1265 // Revert scheduling if we have dropped occupancy or there is some other
1266 // reason that the original schedule is better.
1268
1269 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1272
1273 DAG.exitRegion();
1274 advanceRegion();
1275}
1276
1278 // Check the results of scheduling.
1279 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1280
1281 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1282 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1283
1284 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1285
1288 DAG.Pressure[RegionIdx] = PressureAfter;
1289
1290 // Early out if we have achieved the occupancy target.
1291 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1292 return;
1293 }
1294
1295 unsigned TargetOccupancy = std::min(
1297 unsigned WavesAfter = std::min(
1298 TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1299 unsigned WavesBefore = std::min(
1300 TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1301 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1302 << ", after " << WavesAfter << ".\n");
1303
1304 // We may not be able to keep the current target occupancy because of the just
1305 // scheduled region. We might still be able to revert scheduling if the
1306 // occupancy before was higher, or if the current schedule has register
1307 // pressure higher than the excess limits which could lead to more spilling.
1308 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1309
1310 // Allow memory bound functions to drop to 4 waves if not limited by an
1311 // attribute.
1312 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1313 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1314 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1315 << MFI.getMinAllowedOccupancy() << " waves\n");
1316 NewOccupancy = WavesAfter;
1317 }
1318
1319 if (NewOccupancy < DAG.MinOccupancy) {
1320 DAG.MinOccupancy = NewOccupancy;
1321 MFI.limitOccupancy(DAG.MinOccupancy);
1322 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1323 << DAG.MinOccupancy << ".\n");
1324 }
1325 // The maximum number of arch VGPR on non-unified register file, or the
1326 // maximum VGPR + AGPR in the unified register file case.
1327 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1328 // The maximum number of arch VGPR for both unified and non-unified register
1329 // file.
1330 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1331 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1332
1333 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1334 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1335 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1336 PressureAfter.getSGPRNum() > MaxSGPRs) {
1337 DAG.RegionsWithHighRP[RegionIdx] = true;
1338 DAG.RegionsWithExcessRP[RegionIdx] = true;
1339 }
1340
1341 // Revert if this region's schedule would cause a drop in occupancy or
1342 // spilling.
1343 if (shouldRevertScheduling(WavesAfter))
1345 else
1346 DAG.Pressure[RegionIdx] = PressureAfter;
1347}
1348
1349unsigned
1350GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1351 DenseMap<unsigned, unsigned> &ReadyCycles,
1352 const TargetSchedModel &SM) {
1353 unsigned ReadyCycle = CurrCycle;
1354 for (auto &D : SU.Preds) {
1355 if (D.isAssignedRegDep()) {
1356 MachineInstr *DefMI = D.getSUnit()->getInstr();
1357 unsigned Latency = SM.computeInstrLatency(DefMI);
1358 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1359 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1360 }
1361 }
1362 ReadyCycles[SU.NodeNum] = ReadyCycle;
1363 return ReadyCycle;
1364}
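// Worked example (illustrative numbers only): with CurrCycle = 3 and two
// assigned-reg-dep predecessors whose defs were ready at cycles 2 and 5 with
// latencies 4 and 1 respectively, ReadyCycle = max(3, 2 + 4, 5 + 1) = 6.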
1365
1366#ifndef NDEBUG
1368 bool operator()(std::pair<MachineInstr *, unsigned> A,
1369 std::pair<MachineInstr *, unsigned> B) const {
1370 return A.second < B.second;
1371 }
1372};
1373
1374static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1375 EarlierIssuingCycle> &ReadyCycles) {
1376 if (ReadyCycles.empty())
1377 return;
1378 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1379 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1380 << " ##################\n# Cycle #\t\t\tInstruction "
1381 " "
1382 " \n";
1383 unsigned IPrev = 1;
1384 for (auto &I : ReadyCycles) {
1385 if (I.second > IPrev + 1)
1386 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1387 << " CYCLES DETECTED ******************************\n\n";
1388 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1389 IPrev = I.second;
1390 }
1391}
1392#endif
1393
1395GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1396#ifndef NDEBUG
1397 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1398 ReadyCyclesSorted;
1399#endif
1401 unsigned SumBubbles = 0;
1402 DenseMap<unsigned, unsigned> ReadyCycles;
1403 unsigned CurrCycle = 0;
1404 for (auto &SU : InputSchedule) {
1405 unsigned ReadyCycle =
1406 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1407 SumBubbles += ReadyCycle - CurrCycle;
1408#ifndef NDEBUG
1409 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1410#endif
1411 CurrCycle = ++ReadyCycle;
1412 }
1413#ifndef NDEBUG
1414 LLVM_DEBUG(
1415 printScheduleModel(ReadyCyclesSorted);
1416 dbgs() << "\n\t"
1417 << "Metric: "
1418 << (SumBubbles
1419 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1420 : 1)
1421 << "\n\n");
1422#endif
1423
1424 return ScheduleMetrics(CurrCycle, SumBubbles);
1425}
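// Worked example (illustrative numbers only): if the modeled schedule spans
// CurrCycle = 50 cycles and accumulates SumBubbles = 10 stall cycles, the
// metric printed in the debug output above is
// SumBubbles * ScaleFactor / CurrCycle = 10 * 100 / 50 = 20, i.e. roughly
// the percentage of cycles lost to bubbles.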
1426
1429#ifndef NDEBUG
1430 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1431 ReadyCyclesSorted;
1432#endif
1434 unsigned SumBubbles = 0;
1435 DenseMap<unsigned, unsigned> ReadyCycles;
1436 unsigned CurrCycle = 0;
1437 for (auto &MI : DAG) {
1438 SUnit *SU = DAG.getSUnit(&MI);
1439 if (!SU)
1440 continue;
1441 unsigned ReadyCycle =
1442 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1443 SumBubbles += ReadyCycle - CurrCycle;
1444#ifndef NDEBUG
1445 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1446#endif
1447 CurrCycle = ++ReadyCycle;
1448 }
1449#ifndef NDEBUG
1450 LLVM_DEBUG(
1451 printScheduleModel(ReadyCyclesSorted);
1452 dbgs() << "\n\t"
1453 << "Metric: "
1454 << (SumBubbles
1455 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1456 : 1)
1457 << "\n\n");
1458#endif
1459
1460 return ScheduleMetrics(CurrCycle, SumBubbles);
1461}
1462
1463bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1464 if (WavesAfter < DAG.MinOccupancy)
1465 return true;
1466
1467 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1468 if (DAG.MFI.isDynamicVGPREnabled()) {
1469 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1471 PressureBefore.getVGPRNum(false));
1472 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1474 PressureAfter.getVGPRNum(false));
1475 if (BlocksAfter > BlocksBefore)
1476 return true;
1477 }
1478
1479 return false;
1480}
1481
1484 return false;
1485
1487 return true;
1488
1489 if (mayCauseSpilling(WavesAfter))
1490 return true;
1491
1492 return false;
1493}
1494
1496 // If RP is not reduced in the unclustered reschedule stage, revert to the
1497 // old schedule.
1498 if ((WavesAfter <=
1500 mayCauseSpilling(WavesAfter)) ||
1502 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1503 return true;
1504 }
1505
1506 // Do not attempt to relax schedule even more if we are already spilling.
1508 return false;
1509
1510 LLVM_DEBUG(
1511 dbgs()
1512 << "\n\t *** In shouldRevertScheduling ***\n"
1513 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
1515 LLVM_DEBUG(
1516 dbgs()
1517 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
1519 unsigned OldMetric = MBefore.getMetric();
1520 unsigned NewMetric = MAfter.getMetric();
1521 unsigned WavesBefore = std::min(
1524 unsigned Profit =
1525 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1527 NewMetric) /
1529 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1530 << MAfter << "Profit: " << Profit << "\n");
1531 return Profit < ScheduleMetrics::ScaleFactor;
1532}
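// Worked example (illustrative numbers; the middle of the Profit expression
// is elided in this listing, so this assumes it scales the occupancy ratio
// by (OldMetric + bias) * ScaleFactor / NewMetric): with
// WavesBefore = WavesAfter = 4, OldMetric = 50, NewMetric = 70, and the
// default amdgpu-schedule-metric-bias of 10,
// Profit = (4 * 100 / 4) * (60 * 100 / 70) / 100 = 85, which is below
// ScaleFactor (100), so the unclustered schedule is reverted.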
1533
1536 return false;
1537
1539 return true;
1540
1541 if (mayCauseSpilling(WavesAfter))
1542 return true;
1543
1544 return false;
1545}
1546
1548 return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
1549 mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
1550}
1551
1553 if (mayCauseSpilling(WavesAfter))
1554 return true;
1555
1556 return false;
1557}
1558
1560 unsigned WavesAfter) {
1561 return mayCauseSpilling(WavesAfter);
1562}
1563
1564bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1565 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1567 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1568 return true;
1569 }
1570
1571 return false;
1572}
1573
1575 LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1577 int SkippedDebugInstr = 0;
1578 for (MachineInstr *MI : Unsched) {
1579 if (MI->isDebugInstr()) {
1580 ++SkippedDebugInstr;
1581 continue;
1582 }
1583
1584 if (MI->getIterator() != DAG.RegionEnd) {
1586 if (!MI->isDebugInstr())
1587 DAG.LIS->handleMove(*MI, true);
1588 }
1589
1590 // Reset read-undef flags and update them later.
1591 for (auto &Op : MI->all_defs())
1592 Op.setIsUndef(false);
1593 RegisterOperands RegOpers;
1594 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1595 if (!MI->isDebugInstr()) {
1597 // Adjust liveness and add missing dead+read-undef flags.
1599 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1600 } else {
1601 // Adjust for missing dead-def flags.
1602 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1603 }
1604 }
1605 DAG.RegionEnd = MI->getIterator();
1606 ++DAG.RegionEnd;
1607 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1608 }
1609
1610 // After reverting schedule, debug instrs will now be at the end of the block
1611 // and RegionEnd will point to the first debug instr. Increment RegionEnd
1612 // past debug instrs to the actual end of the scheduling region.
1613 while (SkippedDebugInstr-- > 0)
1614 ++DAG.RegionEnd;
1615
1616 // If Unsched.front() instruction is a debug instruction, this will actually
1617 // shrink the region since we moved all debug instructions to the end of the
1618 // block. Find the first instruction that is not a debug instruction.
1619 DAG.RegionBegin = Unsched.front()->getIterator();
1620 if (DAG.RegionBegin->isDebugInstr()) {
1621 for (MachineInstr *MI : Unsched) {
1622 if (MI->isDebugInstr())
1623 continue;
1624 DAG.RegionBegin = MI->getIterator();
1625 break;
1626 }
1627 }
1628
1629 // Then move the debug instructions back into their correct place and set
1630 // RegionBegin and RegionEnd if needed.
1632
1633 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1634}
1635
1636bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
1637 SlotIndex OriginalIdx,
1638 SlotIndex RematIdx) const {
1639
1640 LiveIntervals *LIS = DAG.LIS;
1642 OriginalIdx = OriginalIdx.getRegSlot(true);
1643 RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true));
1644 for (const MachineOperand &MO : InstToRemat->operands()) {
1645 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1646 continue;
1647
1648 if (!MO.getReg().isVirtual()) {
1649 // Do not attempt to reason about PhysRegs
1650 // TODO: better analysis of PhysReg liveness
1651 if (!DAG.MRI.isConstantPhysReg(MO.getReg()) &&
1652 !DAG.TII->isIgnorableUse(MO))
1653 return false;
1654
1655 // Constant PhysRegs and IgnorableUses are okay
1656 continue;
1657 }
1658
1659 LiveInterval &LI = LIS->getInterval(MO.getReg());
1660 const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx);
1661 assert(OVNI);
1662
1663 // Don't allow rematerialization immediately after the original def.
1664 // It would be incorrect if InstToRemat redefines the register.
1665 // See PR14098.
1666 if (SlotIndex::isSameInstr(OriginalIdx, RematIdx))
1667 return false;
1668
1669 if (OVNI != LI.getVNInfoAt(RematIdx))
1670 return false;
1671
1672 // Check that subrange is live at RematIdx.
1673 if (LI.hasSubRanges()) {
1674 const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
1675 unsigned SubReg = MO.getSubReg();
1676 LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
1677 : MRI.getMaxLaneMaskForVReg(MO.getReg());
1678 for (LiveInterval::SubRange &SR : LI.subranges()) {
1679 if ((SR.LaneMask & LM).none())
1680 continue;
1681 if (!SR.liveAt(RematIdx))
1682 return false;
1683
1684 // Early exit if all used lanes are checked. No need to continue.
1685 LM &= ~SR.LaneMask;
1686 if (LM.none())
1687 break;
1688 }
1689 }
1690 }
1691 return true;
1692}
1693
1694bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
1695 const Function &F = MF.getFunction();
1696
1697 // Maps optimizable regions (i.e., regions at minimum and register-limited
1698 // occupancy, or regions with spilling) to the target RP we would like to
1699 // reach.
1701 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
1702 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
1703 auto ResetTargetRegions = [&]() {
1704 OptRegions.clear();
1705 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1706 const GCNRegPressure &RP = DAG.Pressure[I];
1707 GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
1708 if (!Target.satisfied())
1709 OptRegions.insert({I, Target});
1710 }
1711 };
1712
1713 ResetTargetRegions();
1714 if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
1715 // In addition to register usage being above addressable limits, occupancy
1716 // below the minimum is considered like "spilling" as well.
1717 TargetOcc = std::nullopt;
1718 } else {
1719 // There is no spilling and room to improve occupancy; set up "increased
1720 // occupancy targets" for all regions.
1721 TargetOcc = DAG.MinOccupancy + 1;
1722 unsigned VGPRBlockSize =
1724 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
1725 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
1726 ResetTargetRegions();
1727 }
1728 REMAT_DEBUG({
1729 dbgs() << "Analyzing ";
1730 MF.getFunction().printAsOperand(dbgs(), false);
1731 dbgs() << ": ";
1732 if (OptRegions.empty()) {
1733 dbgs() << "no objective to achieve, occupancy is maximal at "
1734 << MFI.getMaxWavesPerEU();
1735 } else if (!TargetOcc) {
1736 dbgs() << "reduce spilling (minimum target occupancy is "
1737 << MFI.getMinWavesPerEU() << ')';
1738 } else {
1739 dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
1740 << TargetOcc;
1741 }
1742 dbgs() << '\n';
1743 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1744 if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
1745 dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
1746 << '\n';
1747 }
1748 }
1749 });
1750 if (OptRegions.empty())
1751 return false;
1752
1753 // Accounts for a reduction in RP in an optimizable region. Returns whether we
1754 // estimate that we have identified enough rematerialization opportunities to
1755 // achieve our goal, and sets Progress to true when this particular reduction
1756 // in pressure was helpful toward that goal.
1757 auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
1758 bool &Progress) -> bool {
1759 GCNRPTarget &Target = OptIt->getSecond();
1760 if (!Target.isSaveBeneficial(Reg))
1761 return false;
1762 Progress = true;
1763 Target.saveReg(Reg, Mask, DAG.MRI);
1764 if (Target.satisfied())
1765 OptRegions.erase(OptIt->getFirst());
1766 return OptRegions.empty();
1767 };
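  // For instance (hypothetical numbers), if region 3 must shed two VGPRs,
  // saving two single-VGPR registers against its GCNRPTarget satisfies the
  // target and erases the region from OptRegions; the lambda only returns true
  // once the map is empty, i.e. once every optimizable region is expected to
  // meet its target.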
1768
1769  // We need up-to-date live-out info to query live-out register masks in
1770  // regions containing rematerializable instructions.
1771 DAG.RegionLiveOuts.buildLiveRegMap();
1772
1773 // Cache set of registers that are going to be rematerialized.
1774 DenseSet<unsigned> RematRegs;
1775
1776 // Identify rematerializable instructions in the function.
1777 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1778 auto Region = DAG.Regions[I];
1779 for (auto MI = Region.first; MI != Region.second; ++MI) {
1780 // The instruction must be trivially rematerializable.
1781 MachineInstr &DefMI = *MI;
1782 if (!isTriviallyReMaterializable(DefMI))
1783 continue;
1784
1785 // We only support rematerializing virtual registers with one definition.
1786 Register Reg = DefMI.getOperand(0).getReg();
1787 if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
1788 continue;
1789
1790 // We only care to rematerialize the instruction if it has a single
1791 // non-debug user in a different region. The using MI may not belong to a
1792 // region if it is a lone region terminator.
1793      MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
1794      if (!UseMI)
1795 continue;
1796 auto UseRegion = MIRegion.find(UseMI);
1797 if (UseRegion != MIRegion.end() && UseRegion->second == I)
1798 continue;
1799
1800 // Do not rematerialize an instruction if it uses or is used by an
1801 // instruction that we have designated for rematerialization.
1802 // FIXME: Allow for rematerialization chains: this requires 1. updating
1803 // remat points to account for uses that are rematerialized, and 2. either
1804 // rematerializing the candidates in careful ordering, or deferring the
1805 // MBB RP walk until the entire chain has been rematerialized.
1806 if (Rematerializations.contains(UseMI) ||
1807 llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
1808 return MO.isReg() && RematRegs.contains(MO.getReg());
1809 }))
1810 continue;
1811
1812      // Do not rematerialize an instruction if it uses registers that aren't
1813      // available at its use. This ensures that we are not extending any live
1814      // range while rematerializing.
1815      SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
1816      SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
1817 if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
1818 continue;
1819
1820 REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
1821 RematInstruction &Remat =
1822 Rematerializations.try_emplace(&DefMI, UseMI).first->second;
1823
1824 bool RematUseful = false;
1825 if (auto It = OptRegions.find(I); It != OptRegions.end()) {
1826 // Optimistically consider that moving the instruction out of its
1827 // defining region will reduce RP in the latter; this assumes that
1828 // maximum RP in the region is reached somewhere between the defining
1829 // instruction and the end of the region.
1830 REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
1831 LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
1832 if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
1833 return true;
1834 }
1835
1836 for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
1837 // We are only collecting regions in which the register is a live-in
1838 // (and may be live-through).
1839 auto It = DAG.LiveIns[LIRegion].find(Reg);
1840 if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
1841 continue;
1842 Remat.LiveInRegions.insert(LIRegion);
1843
1844 // Account for the reduction in RP due to the rematerialization in an
1845 // optimizable region in which the defined register is a live-in. This
1846        // is exact for live-through regions but optimistic in the using region,
1847 // where RP is actually reduced only if maximum RP is reached somewhere
1848 // between the beginning of the region and the rematerializable
1849 // instruction's use.
1850 if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
1851 REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
1852 if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
1853 RematUseful))
1854 return true;
1855 }
1856 }
1857
1858      // If the register is not a live-in or live-out in any optimizable
1859      // region, then there is no point in rematerializing the instruction.
1860 if (!RematUseful) {
1861 Rematerializations.pop_back();
1862 REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
1863 } else {
1864 RematRegs.insert(Reg);
1865 }
1866 }
1867 }
1868
1869 if (TargetOcc) {
1870 // We were trying to increase occupancy but failed, abort the stage.
1871 REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
1872 Rematerializations.clear();
1873 return false;
1874 }
1875 REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
1876 return !Rematerializations.empty();
1877}
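// Note the asymmetry in the two exits above: when the goal was an occupancy
// increase, partial progress is discarded (Rematerializations.clear()) and the
// stage aborts, whereas in spill-reduction mode any set of useful
// rematerializations is kept even if it cannot eliminate spilling entirely.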
1878
1879void PreRARematStage::rematerialize() {
1880 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
1881
1882  // Collect regions whose RP changes in an unpredictable way; we will have to
1883  // fully recompute their RP after all rematerializations.
1884 DenseSet<unsigned> RecomputeRP;
1885
1886 // Rematerialize all instructions.
1887 for (auto &[DefMI, Remat] : Rematerializations) {
1888    MachineBasicBlock::iterator InsertPos(Remat.UseMI);
1889    Register Reg = DefMI->getOperand(0).getReg();
1890    unsigned DefRegion = MIRegion.at(DefMI);
1891
1892 // Rematerialize DefMI to its use block.
1893 TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
1894 AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
1895 Remat.RematMI = &*std::prev(InsertPos);
1896 DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
1897
1898    // Update region boundaries in regions we sank from (remove the defining
1899    // MI) and to (insert the MI rematerialized in the use block). Only then
1900    // can we erase the original MI.
1901 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
1902 auto UseRegion = MIRegion.find(Remat.UseMI);
1903 if (UseRegion != MIRegion.end()) {
1904 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
1905 Remat.RematMI);
1906 }
1907    DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
1908    DefMI->eraseFromParent();
1909
1910 // Collect all regions impacted by the rematerialization and update their
1911 // live-in/RP information.
1912 for (unsigned I : Remat.LiveInRegions) {
1913 ImpactedRegions.insert({I, DAG.Pressure[I]});
1914 GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
1915
1916#ifdef EXPENSIVE_CHECKS
1917      // All uses are known to be available / live at the remat point. Thus,
1918      // the uses should already be live-in to the region.
1919 for (MachineOperand &MO : DefMI->operands()) {
1920 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1921 continue;
1922
1923 Register UseReg = MO.getReg();
1924 if (!UseReg.isVirtual())
1925 continue;
1926
1927      LiveInterval &LI = DAG.LIS->getInterval(UseReg);
1928      LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
1929 if (LI.hasSubRanges() && MO.getSubReg())
1930 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
1931
1932 LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
1933 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
1934 // If this register has lanes not covered by the LiveIns, be sure they
1935 // do not map to any subrange. ref:
1936 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
1937 if (UncoveredLanes.any()) {
1938 assert(LI.hasSubRanges());
1939 for (LiveInterval::SubRange &SR : LI.subranges())
1940 assert((SR.LaneMask & UncoveredLanes).none());
1941 }
1942 }
1943#endif
1944
1945      // The register is no longer a live-in in any region except the one that
1946      // contains the single use. In live-through regions, maximum register
1947      // pressure decreases predictably, so we can update it directly. In the
1948      // using region, maximum RP may or may not decrease, so we mark it for
1949      // re-computation after all rematerializations have taken place.
1950 LaneBitmask PrevMask = RegionLiveIns[Reg];
1951 RegionLiveIns.erase(Reg);
1952 RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
1953 if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
1954 DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
1955 else
1956 RecomputeRP.insert(I);
1957 }
1958 // RP in the region from which the instruction was rematerialized may or may
1959 // not decrease.
1960 ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
1961 RecomputeRP.insert(DefRegion);
1962
1963 // Recompute live interval to reflect the register's rematerialization.
1964 Register RematReg = Remat.RematMI->getOperand(0).getReg();
1965    DAG.LIS->removeInterval(RematReg);
1966    DAG.LIS->createAndComputeVirtRegInterval(RematReg);
1967 }
1968
1969 // All regions impacted by at least one rematerialization must be rescheduled.
1970 // Maximum pressure must also be recomputed for all regions where it changed
1971 // non-predictably and checked against the target occupancy.
1972  unsigned DynamicVGPRBlockSize =
1973      MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
1974  AchievedOcc = MFI.getMaxWavesPerEU();
1975 for (auto &[I, OriginalRP] : ImpactedRegions) {
1976 bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
1977 RescheduleRegions[I] = !IsEmptyRegion;
1978 if (!RecomputeRP.contains(I))
1979 continue;
1980
1981    GCNRegPressure RP;
1982    if (IsEmptyRegion) {
1983 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
1984 } else {
1985      GCNDownwardRPTracker RPT(*DAG.LIS);
1986      auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
1987 DAG.Regions[I].second);
1988 if (NonDbgMI == DAG.Regions[I].second) {
1989 // Region is non-empty but contains only debug instructions.
1990 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
1991 } else {
1992 RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
1993 RPT.advance(DAG.Regions[I].second);
1994 RP = RPT.moveMaxPressure();
1995 }
1996 }
1997 DAG.Pressure[I] = RP;
1998 AchievedOcc =
1999 std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
2000 }
2001 REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
2002}
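// To summarize the pressure bookkeeping above: a live-through region loses
// exactly PrevMask worth of pressure and is updated in place via
// GCNRegPressure::inc, while the defining and using regions are re-walked with
// a GCNDownwardRPTracker (or recomputed from live-ins when empty) because
// their maximum pressure may have been reached away from the rematerialized
// instruction.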
2003
2004// Copied from MachineLICM
2005bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
2006  if (!DAG.TII->isTriviallyReMaterializable(MI))
2007    return false;
2008
2009 for (const MachineOperand &MO : MI.all_uses()) {
2010 // We can't remat physreg uses, unless it is a constant or an ignorable
2011 // use (e.g. implicit exec use on VALU instructions)
2012 if (MO.getReg().isPhysical()) {
2013 if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
2014 continue;
2015 return false;
2016 }
2017 }
2018
2019 return true;
2020}
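// For example, a VALU move whose only physical-register operand is its
// implicit EXEC use is accepted here, since EXEC reads are ignorable for
// rematerialization, whereas an instruction reading a physical register that
// is neither constant throughout the function nor ignorable is rejected.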
2021
2022void PreRARematStage::finalizeGCNSchedStage() {
2023  // We consider that reducing spilling is always beneficial, so we never roll
2024  // back rematerializations in such cases. It is also possible for rescheduling
2025  // to lower occupancy below the one achieved just through remats, in which
2026  // case we do not want to roll back either (the rescheduling was already
2027  // reverted in PreRARematStage::shouldRevertScheduling in such cases).
2028 unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
2029 if (!TargetOcc || MaxOcc >= *TargetOcc)
2030 return;
2031
2032 REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
2033 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2034
2035 // Rollback the rematerializations.
2036 for (const auto &[DefMI, Remat] : Rematerializations) {
2037 MachineInstr &RematMI = *Remat.RematMI;
2038 unsigned DefRegion = MIRegion.at(DefMI);
2039 MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
2040 MachineBasicBlock *MBB = RegionBB[DefRegion];
2041 Register Reg = RematMI.getOperand(0).getReg();
2042
2043 // Re-rematerialize MI at the end of its original region. Note that it may
2044 // not be rematerialized exactly in the same position as originally within
2045 // the region, but it should not matter much.
2046 TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI,
2047 *DAG.TRI);
2048 MachineInstr *NewMI = &*std::prev(InsertPos);
2049    DAG.LIS->InsertMachineInstrInMaps(*NewMI);
2050
2051 auto UseRegion = MIRegion.find(Remat.UseMI);
2052 if (UseRegion != MIRegion.end()) {
2053 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
2054 nullptr);
2055 }
2056 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
2057
2058 // Erase rematerialized MI.
2059    DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
2060    RematMI.eraseFromParent();
2061
2062 // Recompute live interval for the re-rematerialized register
2063 DAG.LIS->removeInterval(Reg);
2064    DAG.LIS->createAndComputeVirtRegInterval(Reg);
2065
2066 // Re-add the register as a live-in in all regions it used to be one in.
2067 for (unsigned LIRegion : Remat.LiveInRegions)
2068 DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
2069 }
2070
2071 // Reset RP in all impacted regions.
2072 for (auto &[I, OriginalRP] : ImpactedRegions)
2073 DAG.Pressure[I] = OriginalRP;
2074
2075  GCNSchedStage::finalizeGCNSchedStage();
2076}
2077
2078void GCNScheduleDAGMILive::updateRegionBoundaries(
2079    RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
2080    MachineInstr *NewMI) {
2081 assert((!NewMI || NewMI != RegionBounds.second) &&
2082 "cannot remove at region end");
2083
2084 if (RegionBounds.first == RegionBounds.second) {
2085 assert(NewMI && "cannot remove from an empty region");
2086 RegionBounds.first = NewMI;
2087 return;
2088 }
2089
2090  // We only care about modifications at the beginning of a non-empty region
2091  // since the upper region boundary is exclusive.
2092 if (MI != RegionBounds.first)
2093 return;
2094 if (!NewMI)
2095 RegionBounds.first = std::next(MI); // Removal
2096 else
2097 RegionBounds.first = NewMI; // Insertion
2098}
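// By way of example: removing the first instruction of a region
// (NewMI == nullptr) slides RegionBounds.first to the next instruction, while
// inserting a rematerialized instruction right before the old first
// instruction makes NewMI the new region begin. Edits elsewhere in the region,
// or at the exclusive upper bound, require no boundary update.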
2099
2100static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
2101  const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
2102 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
2103 return SII->isIGLPMutationOnly(MI->getOpcode());
2104 });
2105}
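// When IGLP mutations are present, the post-RA DAG below stashes the generic
// mutation list in SavedMutations, installs the IGroupLP mutation for the
// post-RA phase before scheduling, and swaps the original list back in
// finalizeSchedule().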
2106
2107GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
2108    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
2109 bool RemoveKillFlags)
2110 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
2111
2111
2112void GCNPostScheduleDAGMILive::schedule() {
2113  HasIGLPInstrs = hasIGLPInstrs(this);
2114 if (HasIGLPInstrs) {
2115 SavedMutations.clear();
2116 SavedMutations.swap(Mutations);
2117    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
2118  }
2119
2120  ScheduleDAGMI::schedule();
2121}
2122
2123void GCNPostScheduleDAGMILive::finalizeSchedule() {
2124  if (HasIGLPInstrs)
2125 SavedMutations.swap(Mutations);
2126
2127  ScheduleDAGMI::finalizeSchedule();
2128}
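// A minimal usage sketch (assumed wiring, not taken from this file): a target
// would typically instantiate the live scheduling DAG with one of the GCN
// strategies from its machine-scheduler hook, roughly as follows.
//
//   static ScheduleDAGInstrs *createGCNSchedulerSketch(MachineSchedContext *C) {
//     auto Strategy = std::make_unique<GCNMaxOccupancySchedStrategy>(C);
//     return new GCNScheduleDAGMILive(C, std::move(Strategy));
//   }
//
// The factory name here is hypothetical; the real scheduler registration lives
// in the AMDGPU target machine.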