LLVM 22.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements basic block placement transformations using the CFG
10// structure and branch probability estimates.
11//
12// The pass strives to preserve the structure of the CFG (that is, retain
13// a topological ordering of basic blocks) in the absence of a *strong* signal
14// to the contrary from probabilities. However, within the CFG structure, it
15// attempts to choose an ordering which favors placing more likely sequences of
16// blocks adjacent to each other.
17//
18// The algorithm works from the inner-most loop within a function outward, and
19// at each stage walks through the basic blocks, trying to coalesce them into
20// sequential chains where allowed by the CFG (or demanded by heavy
21// probabilities). Finally, it walks the blocks in topological order, and the
22// first time it reaches a chain of basic blocks, it schedules them in the
23// function in-order.
24//
25//===----------------------------------------------------------------------===//
26
28#include "BranchFolding.h"
29#include "llvm/ADT/ArrayRef.h"
30#include "llvm/ADT/DenseMap.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
35#include "llvm/ADT/Statistic.h"
52#include "llvm/IR/DebugLoc.h"
53#include "llvm/IR/Function.h"
54#include "llvm/IR/PrintPasses.h"
56#include "llvm/Pass.h"
63#include "llvm/Support/Debug.h"
67#include <algorithm>
68#include <cassert>
69#include <cstdint>
70#include <iterator>
71#include <memory>
72#include <string>
73#include <tuple>
74#include <utility>
75#include <vector>
76
77using namespace llvm;
78
79#define DEBUG_TYPE "block-placement"
80
81STATISTIC(NumCondBranches, "Number of conditional branches");
82STATISTIC(NumUncondBranches, "Number of unconditional branches");
83STATISTIC(CondBranchTakenFreq,
84 "Potential frequency of taking conditional branches");
85STATISTIC(UncondBranchTakenFreq,
86 "Potential frequency of taking unconditional branches");
87
89 "align-all-blocks",
90 cl::desc("Force the alignment of all blocks in the function in log2 format "
91 "(e.g 4 means align on 16B boundaries)."),
93
95 "align-all-nofallthru-blocks",
96 cl::desc("Force the alignment of all blocks that have no fall-through "
97 "predecessors (i.e. don't add nops that are executed). In log2 "
98 "format (e.g 4 means align on 16B boundaries)."),
100
102 "max-bytes-for-alignment",
103 cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
104 "alignment"),
105 cl::init(0), cl::Hidden);
106
108 "block-placement-predecessor-limit",
109 cl::desc("For blocks with more predecessors, certain layout optimizations"
110 "will be disabled to prevent quadratic compile time."),
111 cl::init(1000), cl::Hidden);
112
113// FIXME: Find a good default for this flag and remove the flag.
115 "block-placement-exit-block-bias",
116 cl::desc("Block frequency percentage a loop exit block needs "
117 "over the original exit to be considered the new exit."),
118 cl::init(0), cl::Hidden);
119
120// Definition:
121// - Outlining: placement of a basic block outside the chain or hot path.
122
124 "loop-to-cold-block-ratio",
125 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
126 "(frequency of block) is greater than this ratio"),
127 cl::init(5), cl::Hidden);
128
129static cl::opt<bool>
130 ForceLoopColdBlock("force-loop-cold-block",
131 cl::desc("Force outlining cold blocks from loops."),
132 cl::init(false), cl::Hidden);
133
134static cl::opt<bool>
135 PreciseRotationCost("precise-rotation-cost",
136 cl::desc("Model the cost of loop rotation more "
137 "precisely by using profile data."),
138 cl::init(false), cl::Hidden);
139
140static cl::opt<bool>
141 ForcePreciseRotationCost("force-precise-rotation-cost",
142 cl::desc("Force the use of precise cost "
143 "loop rotation strategy."),
144 cl::init(false), cl::Hidden);
145
147 "misfetch-cost",
148 cl::desc("Cost that models the probabilistic risk of an instruction "
149 "misfetch due to a jump comparing to falling through, whose cost "
150 "is zero."),
151 cl::init(1), cl::Hidden);
152
153static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
154 cl::desc("Cost of jump instructions."),
155 cl::init(1), cl::Hidden);
156static cl::opt<bool>
157 TailDupPlacement("tail-dup-placement",
158 cl::desc("Perform tail duplication during placement. "
159 "Creates more fallthrough opportunities in "
160 "outline branches."),
161 cl::init(true), cl::Hidden);
162
163static cl::opt<bool>
164 BranchFoldPlacement("branch-fold-placement",
165 cl::desc("Perform branch folding during placement. "
166 "Reduces code size."),
167 cl::init(true), cl::Hidden);
168
169// Heuristic for tail duplication.
171 "tail-dup-placement-threshold",
172 cl::desc("Instruction cutoff for tail duplication during layout. "
173 "Tail merging during layout is forced to have a threshold "
174 "that won't conflict."),
175 cl::init(2), cl::Hidden);
176
177// Heuristic for aggressive tail duplication.
179 "tail-dup-placement-aggressive-threshold",
180 cl::desc("Instruction cutoff for aggressive tail duplication during "
181 "layout. Used at -O3. Tail merging during layout is forced to "
182 "have a threshold that won't conflict."),
183 cl::init(4), cl::Hidden);
184
185// Heuristic for tail duplication.
187 "tail-dup-placement-penalty",
188 cl::desc(
189 "Cost penalty for blocks that can avoid breaking CFG by copying. "
190 "Copying can increase fallthrough, but it also increases icache "
191 "pressure. This parameter controls the penalty to account for that. "
192 "Percent as integer."),
193 cl::init(2), cl::Hidden);
194
195// Heuristic for tail duplication if profile count is used in cost model.
197 "tail-dup-profile-percent-threshold",
198 cl::desc("If profile count information is used in tail duplication cost "
199 "model, the gained fall through number from tail duplication "
200 "should be at least this percent of hot count."),
201 cl::init(50), cl::Hidden);
202
203// Heuristic for triangle chains.
205 "triangle-chain-count",
206 cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
207 "triangle tail duplication heuristic to kick in. 0 to disable."),
208 cl::init(2), cl::Hidden);
209
210// Use case: When block layout is visualized after MBP pass, the basic blocks
211// are labeled in layout order; meanwhile blocks could be numbered in a
212// different order. It's hard to map between the graph and pass output.
213// With this option on, the basic blocks are renumbered in function layout
214// order. For debugging only.
216 "renumber-blocks-before-view",
217 cl::desc(
218 "If true, basic blocks are re-numbered before MBP layout is printed "
219 "into a dot graph. Only used when a function is being printed."),
220 cl::init(false), cl::Hidden);
221
223 "ext-tsp-block-placement-max-blocks",
224 cl::desc("Maximum number of basic blocks in a function to run ext-TSP "
225 "block placement."),
226 cl::init(UINT_MAX), cl::Hidden);
227
228// Apply the ext-tsp algorithm minimizing the size of a binary.
229static cl::opt<bool>
230 ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
231 cl::desc("Use ext-tsp for size-aware block placement."));
232
233namespace llvm {
238
239// Internal option used to control BFI display only after MBP pass.
240// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
241// -view-block-layout-with-bfi=
243
244// Command line option to specify the name of the function for CFG dump
245// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
247} // namespace llvm
248
249namespace {
250
251class BlockChain;
252
253/// Type for our function-wide basic block -> block chain mapping.
254using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
255
256/// A chain of blocks which will be laid out contiguously.
257///
258/// This is the datastructure representing a chain of consecutive blocks that
259/// are profitable to layout together in order to maximize fallthrough
260/// probabilities and code locality. We also can use a block chain to represent
261/// a sequence of basic blocks which have some external (correctness)
262/// requirement for sequential layout.
263///
264/// Chains can be built around a single basic block and can be merged to grow
265/// them. They participate in a block-to-chain mapping, which is updated
266/// automatically as chains are merged together.
267class BlockChain {
268 /// The sequence of blocks belonging to this chain.
269 ///
270 /// This is the sequence of blocks for a particular chain. These will be laid
271 /// out in-order within the function.
273
274 /// A handle to the function-wide basic block to block chain mapping.
275 ///
276 /// This is retained in each block chain to simplify the computation of child
277 /// block chains for SCC-formation and iteration. We store the edges to child
278 /// basic blocks, and map them back to their associated chains using this
279 /// structure.
280 BlockToChainMapType &BlockToChain;
281
282public:
283 /// Construct a new BlockChain.
284 ///
285 /// This builds a new block chain representing a single basic block in the
286 /// function. It also registers itself as the chain that block participates
287 /// in with the BlockToChain mapping.
288 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
289 : Blocks(1, BB), BlockToChain(BlockToChain) {
290 assert(BB && "Cannot create a chain with a null basic block");
291 BlockToChain[BB] = this;
292 }
293
294 /// Iterator over blocks within the chain.
297
298 /// Beginning of blocks within the chain.
299 iterator begin() { return Blocks.begin(); }
300 const_iterator begin() const { return Blocks.begin(); }
301
302 /// End of blocks within the chain.
303 iterator end() { return Blocks.end(); }
304 const_iterator end() const { return Blocks.end(); }
305
306 bool remove(MachineBasicBlock *BB) {
307 for (iterator i = begin(); i != end(); ++i) {
308 if (*i == BB) {
309 Blocks.erase(i);
310 return true;
311 }
312 }
313 return false;
314 }
315
316 /// Merge a block chain into this one.
317 ///
318 /// This routine merges a block chain into this one. It takes care of forming
319 /// a contiguous sequence of basic blocks, updating the edge list, and
320 /// updating the block -> chain mapping. It does not free or tear down the
321 /// old chain, but the old chain's block list is no longer valid.
322 void merge(MachineBasicBlock *BB, BlockChain *Chain) {
323 assert(BB && "Can't merge a null block.");
324 assert(!Blocks.empty() && "Can't merge into an empty chain.");
325
326 // Fast path in case we don't have a chain already.
327 if (!Chain) {
328 assert(!BlockToChain[BB] &&
329 "Passed chain is null, but BB has entry in BlockToChain.");
330 Blocks.push_back(BB);
331 BlockToChain[BB] = this;
332 return;
333 }
334
335 assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
336 assert(Chain->begin() != Chain->end());
337
338 // Update the incoming blocks to point to this chain, and add them to the
339 // chain structure.
340 for (MachineBasicBlock *ChainBB : *Chain) {
341 Blocks.push_back(ChainBB);
342 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
343 BlockToChain[ChainBB] = this;
344 }
345 }
346
347#ifndef NDEBUG
348 /// Dump the blocks in this chain.
349 LLVM_DUMP_METHOD void dump() {
350 for (MachineBasicBlock *MBB : *this)
351 MBB->dump();
352 }
353#endif // NDEBUG
354
355 /// Count of predecessors of any block within the chain which have not
356 /// yet been scheduled. In general, we will delay scheduling this chain
357 /// until those predecessors are scheduled (or we find a sufficiently good
358 /// reason to override this heuristic.) Note that when forming loop chains,
359 /// blocks outside the loop are ignored and treated as if they were already
360 /// scheduled.
361 ///
362 /// Note: This field is reinitialized multiple times - once for each loop,
363 /// and then once for the function as a whole.
364 unsigned UnscheduledPredecessors = 0;
365};
366
367class MachineBlockPlacement {
368 /// A type for a block filter set.
370
371 /// Pair struct containing basic block and taildup profitability
372 struct BlockAndTailDupResult {
373 MachineBasicBlock *BB = nullptr;
374 bool ShouldTailDup;
375 };
376
377 /// Triple struct containing edge weight and the edge.
378 struct WeightedEdge {
379 BlockFrequency Weight;
380 MachineBasicBlock *Src = nullptr;
381 MachineBasicBlock *Dest = nullptr;
382 };
383
384 /// work lists of blocks that are ready to be laid out
387
388 /// Edges that have already been computed as optimal.
390
391 /// Machine Function
392 MachineFunction *F = nullptr;
393
394 /// A handle to the branch probability pass.
395 const MachineBranchProbabilityInfo *MBPI = nullptr;
396
397 /// A handle to the function-wide block frequency pass.
398 std::unique_ptr<MBFIWrapper> MBFI;
399
400 /// A handle to the loop info.
401 MachineLoopInfo *MLI = nullptr;
402
403 /// Preferred loop exit.
404 /// Member variable for convenience. It may be removed by duplication deep
405 /// in the call stack.
406 MachineBasicBlock *PreferredLoopExit = nullptr;
407
408 /// A handle to the target's instruction info.
409 const TargetInstrInfo *TII = nullptr;
410
411 /// A handle to the target's lowering info.
412 const TargetLoweringBase *TLI = nullptr;
413
414 /// A handle to the post dominator tree.
415 MachinePostDominatorTree *MPDT = nullptr;
416
417 ProfileSummaryInfo *PSI = nullptr;
418
419 // Tail merging is also determined based on
420 // whether structured CFG is required.
421 bool AllowTailMerge;
422
423 CodeGenOptLevel OptLevel;
424
425 /// Duplicator used to duplicate tails during placement.
426 ///
427 /// Placement decisions can open up new tail duplication opportunities, but
428 /// since tail duplication affects placement decisions of later blocks, it
429 /// must be done inline.
430 TailDuplicator TailDup;
431
432 /// Partial tail duplication threshold.
433 BlockFrequency DupThreshold;
434
435 unsigned TailDupSize;
436
437 /// True: use block profile count to compute tail duplication cost.
438 /// False: use block frequency to compute tail duplication cost.
439 bool UseProfileCount = false;
440
441 /// Allocator and owner of BlockChain structures.
442 ///
443 /// We build BlockChains lazily while processing the loop structure of
444 /// a function. To reduce malloc traffic, we allocate them using this
445 /// slab-like allocator, and destroy them after the pass completes. An
446 /// important guarantee is that this allocator produces stable pointers to
447 /// the chains.
449
450 /// Function wide BasicBlock to BlockChain mapping.
451 ///
452 /// This mapping allows efficiently moving from any given basic block to the
453 /// BlockChain it participates in, if any. We use it to, among other things,
454 /// allow implicitly defining edges between chains as the existing edges
455 /// between basic blocks.
457
458#ifndef NDEBUG
459 /// The set of basic blocks that have terminators that cannot be fully
460 /// analyzed. These basic blocks cannot be re-ordered safely by
461 /// MachineBlockPlacement, and we must preserve physical layout of these
462 /// blocks and their successors through the pass.
463 SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
464#endif
465
466 /// Get block profile count or frequency according to UseProfileCount.
467 /// The return value is used to model tail duplication cost.
468 BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
469 if (UseProfileCount) {
470 auto Count = MBFI->getBlockProfileCount(BB);
471 if (Count)
472 return BlockFrequency(*Count);
473 else
474 return BlockFrequency(0);
475 } else
476 return MBFI->getBlockFreq(BB);
477 }
478
479 /// Scale the DupThreshold according to basic block size.
480 BlockFrequency scaleThreshold(MachineBasicBlock *BB);
481 void initTailDupThreshold();
482
483 /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
484 /// if the count goes to 0, add them to the appropriate work list.
485 void markChainSuccessors(const BlockChain &Chain,
486 const MachineBasicBlock *LoopHeaderBB,
487 const BlockFilterSet *BlockFilter = nullptr);
488
489 /// Decrease the UnscheduledPredecessors count for a single block, and
490 /// if the count goes to 0, add them to the appropriate work list.
491 void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB,
492 const MachineBasicBlock *LoopHeaderBB,
493 const BlockFilterSet *BlockFilter = nullptr);
494
496 collectViableSuccessors(const MachineBasicBlock *BB, const BlockChain &Chain,
497 const BlockFilterSet *BlockFilter,
499 bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
500 BlockFilterSet *BlockFilter);
501 void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
503 BlockFilterSet *BlockFilter);
504 bool repeatedlyTailDuplicateBlock(
506 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
507 BlockFilterSet *BlockFilter,
508 MachineFunction::iterator &PrevUnplacedBlockIt,
509 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt);
510 bool
511 maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
512 BlockChain &Chain, BlockFilterSet *BlockFilter,
513 MachineFunction::iterator &PrevUnplacedBlockIt,
514 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
515 bool &DuplicatedToLPred);
516 bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB,
517 const MachineBasicBlock *Succ,
518 const BlockChain &SuccChain,
519 BranchProbability SuccProb,
520 BranchProbability RealSuccProb,
521 const BlockChain &Chain,
522 const BlockFilterSet *BlockFilter);
523 BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB,
524 const BlockChain &Chain,
525 const BlockFilterSet *BlockFilter);
527 selectBestCandidateBlock(const BlockChain &Chain,
530 getFirstUnplacedBlock(const BlockChain &PlacedChain,
531 MachineFunction::iterator &PrevUnplacedBlockIt);
533 getFirstUnplacedBlock(const BlockChain &PlacedChain,
534 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
535 const BlockFilterSet *BlockFilter);
536
537 /// Add a basic block to the work list if it is appropriate.
538 ///
539 /// If the optional parameter BlockFilter is provided, only MBB
540 /// present in the set will be added to the worklist. If nullptr
541 /// is provided, no filtering occurs.
542 void fillWorkLists(const MachineBasicBlock *MBB,
543 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
544 const BlockFilterSet *BlockFilter);
545
546 void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
547 BlockFilterSet *BlockFilter = nullptr);
548 bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
549 const MachineBasicBlock *OldTop);
550 bool hasViableTopFallthrough(const MachineBasicBlock *Top,
551 const BlockFilterSet &LoopBlockSet);
552 BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
553 const BlockFilterSet &LoopBlockSet);
554 BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
555 const MachineBasicBlock *OldTop,
556 const MachineBasicBlock *ExitBB,
557 const BlockFilterSet &LoopBlockSet);
558 MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
559 const MachineLoop &L,
560 const BlockFilterSet &LoopBlockSet);
561 MachineBasicBlock *findBestLoopTop(const MachineLoop &L,
562 const BlockFilterSet &LoopBlockSet);
563 MachineBasicBlock *findBestLoopExit(const MachineLoop &L,
564 const BlockFilterSet &LoopBlockSet,
565 BlockFrequency &ExitFreq);
566 BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
567 void buildLoopChains(const MachineLoop &L);
568 void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
569 BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
570 void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L,
571 const BlockFilterSet &LoopBlockSet);
572 void buildCFGChains();
573 void optimizeBranches();
574 void alignBlocks();
575 /// Returns true if a block should be tail-duplicated to increase fallthrough
576 /// opportunities.
577 bool shouldTailDuplicate(MachineBasicBlock *BB);
578 /// Check the edge frequencies to see if tail duplication will increase
579 /// fallthroughs.
580 bool isProfitableToTailDup(const MachineBasicBlock *BB,
581 const MachineBasicBlock *Succ,
582 BranchProbability QProb, const BlockChain &Chain,
583 const BlockFilterSet *BlockFilter);
584
585 /// Check for a trellis layout.
586 bool isTrellis(const MachineBasicBlock *BB,
587 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
588 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
589
590 /// Get the best successor given a trellis layout.
591 BlockAndTailDupResult getBestTrellisSuccessor(
592 const MachineBasicBlock *BB,
593 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
594 BranchProbability AdjustedSumProb, const BlockChain &Chain,
595 const BlockFilterSet *BlockFilter);
596
597 /// Get the best pair of non-conflicting edges.
598 static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
599 const MachineBasicBlock *BB,
601
602 /// Returns true if a block can tail duplicate into all unplaced
603 /// predecessors. Filters based on loop.
604 bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB,
605 MachineBasicBlock *Succ,
606 const BlockChain &Chain,
607 const BlockFilterSet *BlockFilter);
608
609 /// Find chains of triangles to tail-duplicate where a global analysis works,
610 /// but a local analysis would not find them.
611 void precomputeTriangleChains();
612
613 /// Apply a post-processing step optimizing block placement.
614 void applyExtTsp(bool OptForSize);
615
616 /// Modify the existing block placement in the function and adjust all jumps.
617 void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
618
619 /// Create a single CFG chain from the current block order.
620 void createCFGChainExtTsp();
621
622public:
623 MachineBlockPlacement(const MachineBranchProbabilityInfo *MBPI,
625 std::unique_ptr<MBFIWrapper> MBFI,
626 MachinePostDominatorTree *MPDT, bool AllowTailMerge)
627 : MBPI(MBPI), MBFI(std::move(MBFI)), MLI(MLI), MPDT(MPDT), PSI(PSI),
628 AllowTailMerge(AllowTailMerge) {};
629
630 bool run(MachineFunction &F);
631
632 static bool allowTailDupPlacement(MachineFunction &MF) {
634 }
635};
636
637class MachineBlockPlacementLegacy : public MachineFunctionPass {
638public:
639 static char ID; // Pass identification, replacement for typeid
640
641 MachineBlockPlacementLegacy() : MachineFunctionPass(ID) {
643 }
644
645 bool runOnMachineFunction(MachineFunction &MF) override {
646 if (skipFunction(MF.getFunction()))
647 return false;
648
649 auto *MBPI =
650 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
651 auto MBFI = std::make_unique<MBFIWrapper>(
652 getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI());
653 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
654 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
655 ? &getAnalysis<MachinePostDominatorTreeWrapperPass>()
656 .getPostDomTree()
657 : nullptr;
658 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
659 auto *PassConfig = &getAnalysis<TargetPassConfig>();
660 bool AllowTailMerge = PassConfig->getEnableTailMerge();
661 return MachineBlockPlacement(MBPI, MLI, PSI, std::move(MBFI), MPDT,
662 AllowTailMerge)
663 .run(MF);
664 }
665
666 void getAnalysisUsage(AnalysisUsage &AU) const override {
675 }
676};
677
678} // end anonymous namespace
679
680char MachineBlockPlacementLegacy::ID = 0;
681
682char &llvm::MachineBlockPlacementID = MachineBlockPlacementLegacy::ID;
683
684INITIALIZE_PASS_BEGIN(MachineBlockPlacementLegacy, DEBUG_TYPE,
685 "Branch Probability Basic Block Placement", false, false)
691INITIALIZE_PASS_END(MachineBlockPlacementLegacy, DEBUG_TYPE,
692 "Branch Probability Basic Block Placement", false, false)
693
694#ifndef NDEBUG
695/// Helper to print the name of a MBB.
696///
697/// Only used by debug logging.
698static std::string getBlockName(const MachineBasicBlock *BB) {
699 std::string Result;
700 raw_string_ostream OS(Result);
701 OS << printMBBReference(*BB);
702 OS << " ('" << BB->getName() << "')";
703 OS.flush();
704 return Result;
705}
706#endif
707
708/// Mark a chain's successors as having one fewer preds.
709///
710/// When a chain is being merged into the "placed" chain, this routine will
711/// quickly walk the successors of each block in the chain and mark them as
712/// having one fewer active predecessor. It also adds any successors of this
713/// chain which reach the zero-predecessor state to the appropriate worklist.
714void MachineBlockPlacement::markChainSuccessors(
715 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
716 const BlockFilterSet *BlockFilter) {
717 // Walk all the blocks in this chain, marking their successors as having
718 // a predecessor placed.
719 for (MachineBasicBlock *MBB : Chain) {
720 markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
721 }
722}
723
724/// Mark a single block's successors as having one fewer preds.
725///
726/// Under normal circumstances, this is only called by markChainSuccessors,
727/// but if a block that was to be placed is completely tail-duplicated away,
728/// and was duplicated into the chain end, we need to redo markBlockSuccessors
729/// for just that block.
730void MachineBlockPlacement::markBlockSuccessors(
731 const BlockChain &Chain, const MachineBasicBlock *MBB,
732 const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
733 // Add any successors for which this is the only un-placed in-loop
734 // predecessor to the worklist as a viable candidate for CFG-neutral
735 // placement. No subsequent placement of this block will violate the CFG
736 // shape, so we get to use heuristics to choose a favorable placement.
737 for (MachineBasicBlock *Succ : MBB->successors()) {
738 if (BlockFilter && !BlockFilter->count(Succ))
739 continue;
740 BlockChain &SuccChain = *BlockToChain[Succ];
741 // Disregard edges within a fixed chain, or edges to the loop header.
742 if (&Chain == &SuccChain || Succ == LoopHeaderBB)
743 continue;
744
745 // This is a cross-chain edge that is within the loop, so decrement the
746 // loop predecessor count of the destination chain.
747 if (SuccChain.UnscheduledPredecessors == 0 ||
748 --SuccChain.UnscheduledPredecessors > 0)
749 continue;
750
751 auto *NewBB = *SuccChain.begin();
752 if (NewBB->isEHPad())
753 EHPadWorkList.push_back(NewBB);
754 else
755 BlockWorkList.push_back(NewBB);
756 }
757}
758
759/// This helper function collects the set of successors of block
760/// \p BB that are allowed to be its layout successors, and return
761/// the total branch probability of edges from \p BB to those
762/// blocks.
763BranchProbability MachineBlockPlacement::collectViableSuccessors(
764 const MachineBasicBlock *BB, const BlockChain &Chain,
765 const BlockFilterSet *BlockFilter,
767 // Adjust edge probabilities by excluding edges pointing to blocks that is
768 // either not in BlockFilter or is already in the current chain. Consider the
769 // following CFG:
770 //
771 // --->A
772 // | / \
773 // | B C
774 // | \ / \
775 // ----D E
776 //
777 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
778 // A->C is chosen as a fall-through, D won't be selected as a successor of C
779 // due to CFG constraint (the probability of C->D is not greater than
780 // HotProb to break topo-order). If we exclude E that is not in BlockFilter
781 // when calculating the probability of C->D, D will be selected and we
782 // will get A C D B as the layout of this loop.
783 auto AdjustedSumProb = BranchProbability::getOne();
784 for (MachineBasicBlock *Succ : BB->successors()) {
785 bool SkipSucc = false;
786 if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
787 SkipSucc = true;
788 } else {
789 BlockChain *SuccChain = BlockToChain[Succ];
790 if (SuccChain == &Chain) {
791 SkipSucc = true;
792 } else if (Succ != *SuccChain->begin()) {
793 LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
794 << " -> Mid chain!\n");
795 continue;
796 }
797 }
798 if (SkipSucc)
799 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
800 else
801 Successors.push_back(Succ);
802 }
803
804 return AdjustedSumProb;
805}
806
807/// The helper function returns the branch probability that is adjusted
808/// or normalized over the new total \p AdjustedSumProb.
811 BranchProbability AdjustedSumProb) {
812 BranchProbability SuccProb;
813 uint32_t SuccProbN = OrigProb.getNumerator();
814 uint32_t SuccProbD = AdjustedSumProb.getNumerator();
815 if (SuccProbN >= SuccProbD)
816 SuccProb = BranchProbability::getOne();
817 else
818 SuccProb = BranchProbability(SuccProbN, SuccProbD);
819
820 return SuccProb;
821}
822
823/// Check if \p BB has exactly the successors in \p Successors.
824static bool
827 if (BB.succ_size() != Successors.size())
828 return false;
829 // We don't want to count self-loops
830 if (Successors.count(&BB))
831 return false;
832 for (MachineBasicBlock *Succ : BB.successors())
833 if (!Successors.count(Succ))
834 return false;
835 return true;
836}
837
838/// Check if a block should be tail duplicated to increase fallthrough
839/// opportunities.
840/// \p BB Block to check.
841bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
842 // Blocks with single successors don't create additional fallthrough
843 // opportunities. Don't duplicate them. TODO: When conditional exits are
844 // analyzable, allow them to be duplicated.
845 bool IsSimple = TailDup.isSimpleBB(BB);
846
847 if (BB->succ_size() == 1)
848 return false;
849 return TailDup.shouldTailDuplicate(IsSimple, *BB);
850}
851
852/// Compare 2 BlockFrequency's with a small penalty for \p A.
853/// In order to be conservative, we apply a X% penalty to account for
854/// increased icache pressure and static heuristics. For small frequencies
855/// we use only the numerators to improve accuracy. For simplicity, we assume
856/// the penalty is less than 100%
857/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
859 BlockFrequency EntryFreq) {
860 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
861 BlockFrequency Gain = A - B;
862 return (Gain / ThresholdProb) >= EntryFreq;
863}
864
865/// Check the edge frequencies to see if tail duplication will increase
866/// fallthroughs. It only makes sense to call this function when
867/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
868/// always locally profitable if we would have picked \p Succ without
869/// considering duplication.
870bool MachineBlockPlacement::isProfitableToTailDup(
871 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
872 BranchProbability QProb, const BlockChain &Chain,
873 const BlockFilterSet *BlockFilter) {
874 // We need to do a probability calculation to make sure this is profitable.
875 // First: does succ have a successor that post-dominates? This affects the
876 // calculation. The 2 relevant cases are:
877 // BB BB
878 // | \Qout | \Qout
879 // P| C |P C
880 // = C' = C'
881 // | /Qin | /Qin
882 // | / | /
883 // Succ Succ
884 // / \ | \ V
885 // U/ =V |U \
886 // / \ = D
887 // D E | /
888 // | /
889 // |/
890 // PDom
891 // '=' : Branch taken for that CFG edge
892 // In the second case, Placing Succ while duplicating it into C prevents the
893 // fallthrough of Succ into either D or PDom, because they now have C as an
894 // unplaced predecessor
895
896 // Start by figuring out which case we fall into
897 MachineBasicBlock *PDom = nullptr;
899 // Only scan the relevant successors
900 auto AdjustedSuccSumProb =
901 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
902 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
903 auto BBFreq = MBFI->getBlockFreq(BB);
904 auto SuccFreq = MBFI->getBlockFreq(Succ);
905 BlockFrequency P = BBFreq * PProb;
906 BlockFrequency Qout = BBFreq * QProb;
907 BlockFrequency EntryFreq = MBFI->getEntryFreq();
908 // If there are no more successors, it is profitable to copy, as it strictly
909 // increases fallthrough.
910 if (SuccSuccs.size() == 0)
911 return greaterWithBias(P, Qout, EntryFreq);
912
913 auto BestSuccSucc = BranchProbability::getZero();
914 // Find the PDom or the best Succ if no PDom exists.
915 for (MachineBasicBlock *SuccSucc : SuccSuccs) {
916 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
917 if (Prob > BestSuccSucc)
918 BestSuccSucc = Prob;
919 if (PDom == nullptr)
920 if (MPDT->dominates(SuccSucc, Succ)) {
921 PDom = SuccSucc;
922 break;
923 }
924 }
925 // For the comparisons, we need to know Succ's best incoming edge that isn't
926 // from BB.
927 auto SuccBestPred = BlockFrequency(0);
928 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
929 if (SuccPred == Succ || SuccPred == BB ||
930 BlockToChain[SuccPred] == &Chain ||
931 (BlockFilter && !BlockFilter->count(SuccPred)))
932 continue;
933 auto Freq =
934 MBFI->getBlockFreq(SuccPred) * MBPI->getEdgeProbability(SuccPred, Succ);
935 if (Freq > SuccBestPred)
936 SuccBestPred = Freq;
937 }
938 // Qin is Succ's best unplaced incoming edge that isn't BB
939 BlockFrequency Qin = SuccBestPred;
940 // If it doesn't have a post-dominating successor, here is the calculation:
941 // BB BB
942 // | \Qout | \
943 // P| C | =
944 // = C' | C
945 // | /Qin | |
946 // | / | C' (+Succ)
947 // Succ Succ /|
948 // / \ | \/ |
949 // U/ =V | == |
950 // / \ | / \|
951 // D E D E
952 // '=' : Branch taken for that CFG edge
953 // Cost in the first case is: P + V
954 // For this calculation, we always assume P > Qout. If Qout > P
955 // The result of this function will be ignored at the caller.
956 // Let F = SuccFreq - Qin
957 // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
958
959 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
960 BranchProbability UProb = BestSuccSucc;
961 BranchProbability VProb = AdjustedSuccSumProb - UProb;
962 BlockFrequency F = SuccFreq - Qin;
963 BlockFrequency V = SuccFreq * VProb;
964 BlockFrequency QinU = std::min(Qin, F) * UProb;
965 BlockFrequency BaseCost = P + V;
966 BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
967 return greaterWithBias(BaseCost, DupCost, EntryFreq);
968 }
969 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
970 BranchProbability VProb = AdjustedSuccSumProb - UProb;
971 BlockFrequency U = SuccFreq * UProb;
972 BlockFrequency V = SuccFreq * VProb;
973 BlockFrequency F = SuccFreq - Qin;
974 // If there is a post-dominating successor, here is the calculation:
975 // BB BB BB BB
976 // | \Qout | \ | \Qout | \
977 // |P C | = |P C | =
978 // = C' |P C = C' |P C
979 // | /Qin | | | /Qin | |
980 // | / | C' (+Succ) | / | C' (+Succ)
981 // Succ Succ /| Succ Succ /|
982 // | \ V | \/ | | \ V | \/ |
983 // |U \ |U /\ =? |U = |U /\ |
984 // = D = = =?| | D | = =|
985 // | / |/ D | / |/ D
986 // | / | / | = | /
987 // |/ | / |/ | =
988 // Dom Dom Dom Dom
989 // '=' : Branch taken for that CFG edge
990 // The cost for taken branches in the first case is P + U
991 // Let F = SuccFreq - Qin
992 // The cost in the second case (assuming independence), given the layout:
993 // BB, Succ, (C+Succ), D, Dom or the layout:
994 // BB, Succ, D, Dom, (C+Succ)
995 // is Qout + max(F, Qin) * U + min(F, Qin)
996 // compare P + U vs Qout + P * U + Qin.
997 //
998 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
999 //
1000 // For the 3rd case, the cost is P + 2 * V
1001 // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
1002 // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
1003 if (UProb > AdjustedSuccSumProb / 2 &&
1004 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
1005 Chain, BlockFilter))
1006 // Cases 3 & 4
1007 return greaterWithBias(
1008 (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
1009 EntryFreq);
1010 // Cases 1 & 2
1011 return greaterWithBias((P + U),
1012 (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
1013 std::max(Qin, F) * UProb),
1014 EntryFreq);
1015}
1016
1017/// Check for a trellis layout. \p BB is the upper part of a trellis if its
1018/// successors form the lower part of a trellis. A successor set S forms the
1019/// lower part of a trellis if all of the predecessors of S are either in S or
1020/// have all of S as successors. We ignore trellises where BB doesn't have 2
1021/// successors because for fewer than 2, it's trivial, and for 3 or greater they
1022/// are very uncommon and complex to compute optimally. Allowing edges within S
1023/// is not strictly a trellis, but the same algorithm works, so we allow it.
1024bool MachineBlockPlacement::isTrellis(
1025 const MachineBasicBlock *BB,
1026 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1027 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1028 // Technically BB could form a trellis with branching factor higher than 2.
1029 // But that's extremely uncommon.
1030 if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
1031 return false;
1032
1034 BB->successors());
1035 // To avoid reviewing the same predecessors twice.
1037
1038 for (MachineBasicBlock *Succ : ViableSuccs) {
1039 // Compile-time optimization: runtime is quadratic in the number of
1040 // predecessors. For such uncommon cases, exit early.
1041 if (Succ->pred_size() > PredecessorLimit)
1042 return false;
1043
1044 int PredCount = 0;
1045 for (auto *SuccPred : Succ->predecessors()) {
1046 // Allow triangle successors, but don't count them.
1047 if (Successors.count(SuccPred)) {
1048 // Make sure that it is actually a triangle.
1049 for (MachineBasicBlock *CheckSucc : SuccPred->successors())
1050 if (!Successors.count(CheckSucc))
1051 return false;
1052 continue;
1053 }
1054 const BlockChain *PredChain = BlockToChain[SuccPred];
1055 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
1056 PredChain == &Chain || PredChain == BlockToChain[Succ])
1057 continue;
1058 ++PredCount;
1059 // Perform the successor check only once.
1060 if (!SeenPreds.insert(SuccPred).second)
1061 continue;
1062 if (!hasSameSuccessors(*SuccPred, Successors))
1063 return false;
1064 }
1065 // If one of the successors has only BB as a predecessor, it is not a
1066 // trellis.
1067 if (PredCount < 1)
1068 return false;
1069 }
1070 return true;
1071}
1072
1073/// Pick the highest total weight pair of edges that can both be laid out.
1074/// The edges in \p Edges[0] are assumed to have a different destination than
1075/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1076/// the individual highest weight edges to the 2 different destinations, or in
1077/// case of a conflict, one of them should be replaced with a 2nd best edge.
1078std::pair<MachineBlockPlacement::WeightedEdge,
1079 MachineBlockPlacement::WeightedEdge>
1080MachineBlockPlacement::getBestNonConflictingEdges(
1081 const MachineBasicBlock *BB,
1083 Edges) {
1084 // Sort the edges, and then for each successor, find the best incoming
1085 // predecessor. If the best incoming predecessors aren't the same,
1086 // then that is clearly the best layout. If there is a conflict, one of the
1087 // successors will have to fallthrough from the second best predecessor. We
1088 // compare which combination is better overall.
1089
1090 // Sort for highest frequency.
1091 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1092
1093 llvm::stable_sort(Edges[0], Cmp);
1094 llvm::stable_sort(Edges[1], Cmp);
1095 auto BestA = Edges[0].begin();
1096 auto BestB = Edges[1].begin();
1097 // Arrange for the correct answer to be in BestA and BestB
1098 // If the 2 best edges don't conflict, the answer is already there.
1099 if (BestA->Src == BestB->Src) {
1100 // Compare the total fallthrough of (Best + Second Best) for both pairs
1101 auto SecondBestA = std::next(BestA);
1102 auto SecondBestB = std::next(BestB);
1103 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1104 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1105 if (BestAScore < BestBScore)
1106 BestA = SecondBestA;
1107 else
1108 BestB = SecondBestB;
1109 }
1110 // Arrange for the BB edge to be in BestA if it exists.
1111 if (BestB->Src == BB)
1112 std::swap(BestA, BestB);
1113 return std::make_pair(*BestA, *BestB);
1114}
1115
1116/// Get the best successor from \p BB based on \p BB being part of a trellis.
1117/// We only handle trellises with 2 successors, so the algorithm is
1118/// straightforward: Find the best pair of edges that don't conflict. We find
1119/// the best incoming edge for each successor in the trellis. If those conflict,
1120/// we consider which of them should be replaced with the second best.
1121/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1122/// comes from \p BB, it will be in \p BestEdges[0]
1123MachineBlockPlacement::BlockAndTailDupResult
1124MachineBlockPlacement::getBestTrellisSuccessor(
1125 const MachineBasicBlock *BB,
1126 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1127 BranchProbability AdjustedSumProb, const BlockChain &Chain,
1128 const BlockFilterSet *BlockFilter) {
1129
1130 BlockAndTailDupResult Result = {nullptr, false};
1132 BB->successors());
1133
1134 // We assume size 2 because it's common. For general n, we would have to do
1135 // the Hungarian algorithm, but it's not worth the complexity because more
1136 // than 2 successors is fairly uncommon, and a trellis even more so.
1137 if (Successors.size() != 2 || ViableSuccs.size() != 2)
1138 return Result;
1139
1140 // Collect the edge frequencies of all edges that form the trellis.
1142 int SuccIndex = 0;
1143 for (auto *Succ : ViableSuccs) {
1144 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1145 // Skip any placed predecessors that are not BB
1146 if (SuccPred != BB) {
1147 if (BlockFilter && !BlockFilter->count(SuccPred))
1148 continue;
1149 const BlockChain *SuccPredChain = BlockToChain[SuccPred];
1150 if (SuccPredChain == &Chain || SuccPredChain == BlockToChain[Succ])
1151 continue;
1152 }
1153 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1154 MBPI->getEdgeProbability(SuccPred, Succ);
1155 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1156 }
1157 ++SuccIndex;
1158 }
1159
1160 // Pick the best combination of 2 edges from all the edges in the trellis.
1161 WeightedEdge BestA, BestB;
1162 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1163
1164 if (BestA.Src != BB) {
1165 // If we have a trellis, and BB doesn't have the best fallthrough edges,
1166 // we shouldn't choose any successor. We've already looked and there's a
1167 // better fallthrough edge for all the successors.
1168 LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1169 return Result;
1170 }
1171
1172 // Did we pick the triangle edge? If tail-duplication is profitable, do
1173 // that instead. Otherwise merge the triangle edge now while we know it is
1174 // optimal.
1175 if (BestA.Dest == BestB.Src) {
1176 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1177 // would be better.
1178 MachineBasicBlock *Succ1 = BestA.Dest;
1179 MachineBasicBlock *Succ2 = BestB.Dest;
1180 // Check to see if tail-duplication would be profitable.
1181 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ2) &&
1182 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1183 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1184 Chain, BlockFilter)) {
1186 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1187 dbgs() << " Selected: " << getBlockName(Succ2)
1188 << ", probability: " << Succ2Prob
1189 << " (Tail Duplicate)\n");
1190 Result.BB = Succ2;
1191 Result.ShouldTailDup = true;
1192 return Result;
1193 }
1194 }
1195 // We have already computed the optimal edge for the other side of the
1196 // trellis.
1197 ComputedEdges[BestB.Src] = {BestB.Dest, false};
1198
1199 auto TrellisSucc = BestA.Dest;
1201 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1202 dbgs() << " Selected: " << getBlockName(TrellisSucc)
1203 << ", probability: " << SuccProb << " (Trellis)\n");
1204 Result.BB = TrellisSucc;
1205 return Result;
1206}
1207
1208/// When the option allowTailDupPlacement() is on, this method checks if the
1209/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1210/// into all of its unplaced, unfiltered predecessors, that are not BB.
1211bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1212 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1213 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1214 if (!shouldTailDuplicate(Succ))
1215 return false;
1216
1217 // The result of canTailDuplicate.
1218 bool Duplicate = true;
1219 // Number of possible duplication.
1220 unsigned int NumDup = 0;
1221
1222 // For CFG checking.
1224 BB->successors());
1225 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1226 // Make sure all unplaced and unfiltered predecessors can be
1227 // tail-duplicated into.
1228 // Skip any blocks that are already placed or not in this loop.
1229 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) ||
1230 (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
1231 continue;
1232 if (!TailDup.canTailDuplicate(Succ, Pred)) {
1233 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1234 // This will result in a trellis after tail duplication, so we don't
1235 // need to copy Succ into this predecessor. In the presence
1236 // of a trellis tail duplication can continue to be profitable.
1237 // For example:
1238 // A A
1239 // |\ |\
1240 // | \ | \
1241 // | C | C+BB
1242 // | / | |
1243 // |/ | |
1244 // BB => BB |
1245 // |\ |\/|
1246 // | \ |/\|
1247 // | D | D
1248 // | / | /
1249 // |/ |/
1250 // Succ Succ
1251 //
1252 // After BB was duplicated into C, the layout looks like the one on the
1253 // right. BB and C now have the same successors. When considering
1254 // whether Succ can be duplicated into all its unplaced predecessors, we
1255 // ignore C.
1256 // We can do this because C already has a profitable fallthrough, namely
1257 // D. TODO(iteratee): ignore sufficiently cold predecessors for
1258 // duplication and for this test.
1259 //
1260 // This allows trellises to be laid out in 2 separate chains
1261 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1262 // because it allows the creation of 2 fallthrough paths with links
1263 // between them, and we correctly identify the best layout for these
1264 // CFGs. We want to extend trellises that the user created in addition
1265 // to trellises created by tail-duplication, so we just look for the
1266 // CFG.
1267 continue;
1268 Duplicate = false;
1269 continue;
1270 }
1271 NumDup++;
1272 }
1273
1274 // No possible duplication in current filter set.
1275 if (NumDup == 0)
1276 return false;
1277
1278 // If profile information is available, findDuplicateCandidates can do more
1279 // precise benefit analysis.
1280 if (F->getFunction().hasProfileData())
1281 return true;
1282
1283 // This is mainly for function exit BB.
1284 // The integrated tail duplication is really designed for increasing
1285 // fallthrough from predecessors from Succ to its successors. We may need
1286 // other machanism to handle different cases.
1287 if (Succ->succ_empty())
1288 return true;
1289
1290 // Plus the already placed predecessor.
1291 NumDup++;
1292
1293 // If the duplication candidate has more unplaced predecessors than
1294 // successors, the extra duplication can't bring more fallthrough.
1295 //
1296 // Pred1 Pred2 Pred3
1297 // \ | /
1298 // \ | /
1299 // \ | /
1300 // Dup
1301 // / \
1302 // / \
1303 // Succ1 Succ2
1304 //
1305 // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1306 // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1307 // but the duplication into Pred3 can't increase fallthrough.
1308 //
1309 // A small number of extra duplication may not hurt too much. We need a better
1310 // heuristic to handle it.
1311 if ((NumDup > Succ->succ_size()) || !Duplicate)
1312 return false;
1313
1314 return true;
1315}
1316
1317/// Find chains of triangles where we believe it would be profitable to
1318/// tail-duplicate them all, but a local analysis would not find them.
1319/// There are 3 ways this can be profitable:
1320/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1321/// longer chains)
1322/// 2) The chains are statically correlated. Branch probabilities have a very
1323/// U-shaped distribution.
1324/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1325/// If the branches in a chain are likely to be from the same side of the
1326/// distribution as their predecessor, but are independent at runtime, this
1327/// transformation is profitable. (Because the cost of being wrong is a small
1328/// fixed cost, unlike the standard triangle layout where the cost of being
1329/// wrong scales with the # of triangles.)
1330/// 3) The chains are dynamically correlated. If the probability that a previous
1331/// branch was taken positively influences whether the next branch will be
1332/// taken
1333/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1334void MachineBlockPlacement::precomputeTriangleChains() {
1335 struct TriangleChain {
1336 std::vector<MachineBasicBlock *> Edges;
1337
1338 TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1339 : Edges({src, dst}) {}
1340
1341 void append(MachineBasicBlock *dst) {
1342 assert(getKey()->isSuccessor(dst) &&
1343 "Attempting to append a block that is not a successor.");
1344 Edges.push_back(dst);
1345 }
1346
1347 unsigned count() const { return Edges.size() - 1; }
1348
1349 MachineBasicBlock *getKey() const { return Edges.back(); }
1350 };
1351
1352 if (TriangleChainCount == 0)
1353 return;
1354
1355 LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1356 // Map from last block to the chain that contains it. This allows us to extend
1357 // chains as we find new triangles.
1359 for (MachineBasicBlock &BB : *F) {
1360 // If BB doesn't have 2 successors, it doesn't start a triangle.
1361 if (BB.succ_size() != 2)
1362 continue;
1363 MachineBasicBlock *PDom = nullptr;
1364 for (MachineBasicBlock *Succ : BB.successors()) {
1365 if (!MPDT->dominates(Succ, &BB))
1366 continue;
1367 PDom = Succ;
1368 break;
1369 }
1370 // If BB doesn't have a post-dominating successor, it doesn't form a
1371 // triangle.
1372 if (PDom == nullptr)
1373 continue;
1374 // If PDom has a hint that it is low probability, skip this triangle.
1375 if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1376 continue;
1377 // If PDom isn't eligible for duplication, this isn't the kind of triangle
1378 // we're looking for.
1379 if (!shouldTailDuplicate(PDom))
1380 continue;
1381 bool CanTailDuplicate = true;
1382 // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1383 // isn't the kind of triangle we're looking for.
1384 for (MachineBasicBlock *Pred : PDom->predecessors()) {
1385 if (Pred == &BB)
1386 continue;
1387 if (!TailDup.canTailDuplicate(PDom, Pred)) {
1388 CanTailDuplicate = false;
1389 break;
1390 }
1391 }
1392 // If we can't tail-duplicate PDom to its predecessors, then skip this
1393 // triangle.
1394 if (!CanTailDuplicate)
1395 continue;
1396
1397 // Now we have an interesting triangle. Insert it if it's not part of an
1398 // existing chain.
1399 // Note: This cannot be replaced with a call insert() or emplace() because
1400 // the find key is BB, but the insert/emplace key is PDom.
1401 auto Found = TriangleChainMap.find(&BB);
1402 // If it is, remove the chain from the map, grow it, and put it back in the
1403 // map with the end as the new key.
1404 if (Found != TriangleChainMap.end()) {
1405 TriangleChain Chain = std::move(Found->second);
1406 TriangleChainMap.erase(Found);
1407 Chain.append(PDom);
1408 TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1409 } else {
1410 auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1411 assert(InsertResult.second && "Block seen twice.");
1412 (void)InsertResult;
1413 }
1414 }
1415
1416 // Iterating over a DenseMap is safe here, because the only thing in the body
1417 // of the loop is inserting into another DenseMap (ComputedEdges).
1418 // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1419 for (auto &ChainPair : TriangleChainMap) {
1420 TriangleChain &Chain = ChainPair.second;
1421 // Benchmarking has shown that due to branch correlation duplicating 2 or
1422 // more triangles is profitable, despite the calculations assuming
1423 // independence.
1424 if (Chain.count() < TriangleChainCount)
1425 continue;
1426 MachineBasicBlock *dst = Chain.Edges.back();
1427 Chain.Edges.pop_back();
1428 for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1429 LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1430 << getBlockName(dst)
1431 << " as pre-computed based on triangles.\n");
1432
1433 auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1434 assert(InsertResult.second && "Block seen twice.");
1435 (void)InsertResult;
1436
1437 dst = src;
1438 }
1439 }
1440}
1441
1442// When profile is not present, return the StaticLikelyProb.
1443// When profile is available, we need to handle the triangle-shape CFG.
1444static BranchProbability
1446 if (!BB->getParent()->getFunction().hasProfileData())
1448 if (BB->succ_size() == 2) {
1449 const MachineBasicBlock *Succ1 = *BB->succ_begin();
1450 const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1451 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1452 /* See case 1 below for the cost analysis. For BB->Succ to
1453 * be taken with smaller cost, the following needs to hold:
1454 * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1455 * So the threshold T in the calculation below
1456 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1457 * So T / (1 - T) = 2, Yielding T = 2/3
1458 * Also adding user specified branch bias, we have
1459 * T = (2/3)*(ProfileLikelyProb/50)
1460 * = (2*ProfileLikelyProb)/150)
1461 */
1462 return BranchProbability(2 * ProfileLikelyProb, 150);
1463 }
1464 }
1466}
1467
1468/// Checks to see if the layout candidate block \p Succ has a better layout
1469/// predecessor than \c BB. If yes, returns true.
1470/// \p SuccProb: The probability adjusted for only remaining blocks.
1471/// Only used for logging
1472/// \p RealSuccProb: The un-adjusted probability.
1473/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1474/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1475/// considered
1476bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1477 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1478 const BlockChain &SuccChain, BranchProbability SuccProb,
1479 BranchProbability RealSuccProb, const BlockChain &Chain,
1480 const BlockFilterSet *BlockFilter) {
1481
1482 // There isn't a better layout when there are no unscheduled predecessors.
1483 if (SuccChain.UnscheduledPredecessors == 0)
1484 return false;
1485
1486 // Compile-time optimization: runtime is quadratic in the number of
1487 // predecessors. For such uncommon cases, exit early.
1488 if (Succ->pred_size() > PredecessorLimit)
1489 return false;
1490
1491 // There are two basic scenarios here:
1492 // -------------------------------------
1493 // Case 1: triangular shape CFG (if-then):
1494 // BB
1495 // | \
1496 // | \
1497 // | Pred
1498 // | /
1499 // Succ
1500 // In this case, we are evaluating whether to select edge -> Succ, e.g.
1501 // set Succ as the layout successor of BB. Picking Succ as BB's
1502 // successor breaks the CFG constraints (FIXME: define these constraints).
1503 // With this layout, Pred BB
1504 // is forced to be outlined, so the overall cost will be cost of the
1505 // branch taken from BB to Pred, plus the cost of back taken branch
1506 // from Pred to Succ, as well as the additional cost associated
1507 // with the needed unconditional jump instruction from Pred To Succ.
1508
1509 // The cost of the topological order layout is the taken branch cost
1510 // from BB to Succ, so to make BB->Succ a viable candidate, the following
1511 // must hold:
1512 // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1513 // < freq(BB->Succ) * taken_branch_cost.
1514 // Ignoring unconditional jump cost, we get
1515 // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1516 // prob(BB->Succ) > 2 * prob(BB->Pred)
1517 //
1518 // When real profile data is available, we can precisely compute the
1519 // probability threshold that is needed for edge BB->Succ to be considered.
1520 // Without profile data, the heuristic requires the branch bias to be
1521 // a lot larger to make sure the signal is very strong (e.g. 80% default).
1522 // -----------------------------------------------------------------
1523 // Case 2: diamond like CFG (if-then-else):
1524 // S
1525 // / \
1526 // | \
1527 // BB Pred
1528 // \ /
1529 // Succ
1530 // ..
1531 //
1532 // The current block is BB and edge BB->Succ is now being evaluated.
1533 // Note that edge S->BB was previously already selected because
1534 // prob(S->BB) > prob(S->Pred).
1535 // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1536 // choose Pred, we will have a topological ordering as shown on the left
1537 // in the picture below. If we choose Succ, we have the solution as shown
1538 // on the right:
1539 //
1540 // topo-order:
1541 //
1542 // S----- ---S
1543 // | | | |
1544 // ---BB | | BB
1545 // | | | |
1546 // | Pred-- | Succ--
1547 // | | | |
1548 // ---Succ ---Pred--
1549 //
1550 // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1551 // = freq(S->Pred) + freq(S->BB)
1552 //
1553 // If we have profile data (i.e, branch probabilities can be trusted), the
1554 // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1555 // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1556 // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1557 // means the cost of topological order is greater.
1558 // When profile data is not available, however, we need to be more
1559 // conservative. If the branch prediction is wrong, breaking the topo-order
1560 // will actually yield a layout with large cost. For this reason, we need
1561 // strong biased branch at block S with Prob(S->BB) in order to select
1562 // BB->Succ. This is equivalent to looking the CFG backward with backward
1563 // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1564 // profile data).
1565 // --------------------------------------------------------------------------
1566 // Case 3: forked diamond
1567 // S
1568 // / \
1569 // / \
1570 // BB Pred
1571 // | \ / |
1572 // | \ / |
1573 // | X |
1574 // | / \ |
1575 // | / \ |
1576 // S1 S2
1577 //
1578 // The current block is BB and edge BB->S1 is now being evaluated.
1579 // As above S->BB was already selected because
1580 // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1581 //
1582 // topo-order:
1583 //
1584 // S-------| ---S
1585 // | | | |
1586 // ---BB | | BB
1587 // | | | |
1588 // | Pred----| | S1----
1589 // | | | |
1590 // --(S1 or S2) ---Pred--
1591 // |
1592 // S2
1593 //
1594 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1595 // + min(freq(Pred->S1), freq(Pred->S2))
1596 // Non-topo-order cost:
1597 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1598 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1599 // is 0. Then the non topo layout is better when
1600 // freq(S->Pred) < freq(BB->S1).
1601 // This is exactly what is checked below.
1602 // Note there are other shapes that apply (Pred may not be a single block,
1603 // but they all fit this general pattern.)
1605
1606 // Make sure that a hot successor doesn't have a globally more
1607 // important predecessor.
1608 BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1609 bool BadCFGConflict = false;
1610
1611 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1612 BlockChain *PredChain = BlockToChain[Pred];
1613 if (Pred == Succ || PredChain == &SuccChain ||
1614 (BlockFilter && !BlockFilter->count(Pred)) || PredChain == &Chain ||
1615 Pred != *std::prev(PredChain->end()) ||
1616 // This check is redundant except for look ahead. This function is
1617 // called for lookahead by isProfitableToTailDup when BB hasn't been
1618 // placed yet.
1619 (Pred == BB))
1620 continue;
1621 // Do backward checking.
1622 // For all cases above, we need a backward checking to filter out edges that
1623 // are not 'strongly' biased.
1624 // BB Pred
1625 // \ /
1626 // Succ
1627 // We select edge BB->Succ if
1628 // freq(BB->Succ) > freq(Succ) * HotProb
1629 // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1630 // HotProb
1631 // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1632 // Case 1 is covered too, because the first equation reduces to:
1633 // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1634 BlockFrequency PredEdgeFreq =
1635 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1636 if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1637 BadCFGConflict = true;
1638 break;
1639 }
1640 }
1641
1642 if (BadCFGConflict) {
1643 LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1644 << SuccProb << " (prob) (non-cold CFG conflict)\n");
1645 return true;
1646 }
1647
1648 return false;
1649}
1650
1651/// Select the best successor for a block.
1652///
1653/// This looks across all successors of a particular block and attempts to
1654/// select the "best" one to be the layout successor. It only considers direct
1655/// successors which also pass the block filter. It will attempt to avoid
1656/// breaking CFG structure, but cave and break such structures in the case of
1657/// very hot successor edges.
1658///
1659/// \returns The best successor block found, or null if none are viable, along
1660/// with a boolean indicating if tail duplication is necessary.
1661MachineBlockPlacement::BlockAndTailDupResult
1662MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB,
1663 const BlockChain &Chain,
1664 const BlockFilterSet *BlockFilter) {
1665 const BranchProbability HotProb(StaticLikelyProb, 100);
1666
1667 BlockAndTailDupResult BestSucc = {nullptr, false};
1668 auto BestProb = BranchProbability::getZero();
1669
1671 auto AdjustedSumProb =
1672 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1673
1674 LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1675 << "\n");
1676
1677 // if we already precomputed the best successor for BB, return that if still
1678 // applicable.
1679 auto FoundEdge = ComputedEdges.find(BB);
1680 if (FoundEdge != ComputedEdges.end()) {
1681 MachineBasicBlock *Succ = FoundEdge->second.BB;
1682 ComputedEdges.erase(FoundEdge);
1683 BlockChain *SuccChain = BlockToChain[Succ];
1684 if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1685 SuccChain != &Chain && Succ == *SuccChain->begin())
1686 return FoundEdge->second;
1687 }
1688
1689 // if BB is part of a trellis, Use the trellis to determine the optimal
1690 // fallthrough edges
1691 if (isTrellis(BB, Successors, Chain, BlockFilter))
1692 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1693 BlockFilter);
1694
1695 // For blocks with CFG violations, we may be able to lay them out anyway with
1696 // tail-duplication. We keep this vector so we can perform the probability
1697 // calculations the minimum number of times.
1699 DupCandidates;
1700 for (MachineBasicBlock *Succ : Successors) {
1701 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1702 BranchProbability SuccProb =
1703 getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1704
1705 BlockChain &SuccChain = *BlockToChain[Succ];
1706 // Skip the edge \c BB->Succ if block \c Succ has a better layout
1707 // predecessor that yields lower global cost.
1708 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1709 Chain, BlockFilter)) {
1710 // If tail duplication would make Succ profitable, place it.
1711 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ))
1712 DupCandidates.emplace_back(SuccProb, Succ);
1713 continue;
1714 }
1715
1716 LLVM_DEBUG(
1717 dbgs() << " Candidate: " << getBlockName(Succ)
1718 << ", probability: " << SuccProb
1719 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1720 << "\n");
1721
1722 if (BestSucc.BB && BestProb >= SuccProb) {
1723 LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1724 continue;
1725 }
1726
1727 LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1728 BestSucc.BB = Succ;
1729 BestProb = SuccProb;
1730 }
1731 // Handle the tail duplication candidates in order of decreasing probability.
1732 // Stop at the first one that is profitable. Also stop if they are less
1733 // profitable than BestSucc. Position is important because we preserve it and
1734 // prefer first best match. Here we aren't comparing in order, so we capture
1735 // the position instead.
1736 llvm::stable_sort(DupCandidates,
1737 [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1738 std::tuple<BranchProbability, MachineBasicBlock *> R) {
1739 return std::get<0>(L) > std::get<0>(R);
1740 });
1741 for (auto &Tup : DupCandidates) {
1742 BranchProbability DupProb;
1743 MachineBasicBlock *Succ;
1744 std::tie(DupProb, Succ) = Tup;
1745 if (DupProb < BestProb)
1746 break;
1747 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) &&
1748 (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1749 LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1750 << ", probability: " << DupProb
1751 << " (Tail Duplicate)\n");
1752 BestSucc.BB = Succ;
1753 BestSucc.ShouldTailDup = true;
1754 break;
1755 }
1756 }
1757
1758 if (BestSucc.BB)
1759 LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1760
1761 return BestSucc;
1762}
1763
1764/// Select the best block from a worklist.
1765///
1766/// This looks through the provided worklist as a list of candidate basic
1767/// blocks and select the most profitable one to place. The definition of
1768/// profitable only really makes sense in the context of a loop. This returns
1769/// the most frequently visited block in the worklist, which in the case of
1770/// a loop, is the one most desirable to be physically close to the rest of the
1771/// loop body in order to improve i-cache behavior.
1772///
1773/// \returns The best block found, or null if none are viable.
1774MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1775 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1776 // Once we need to walk the worklist looking for a candidate, cleanup the
1777 // worklist of already placed entries.
1778 // FIXME: If this shows up on profiles, it could be folded (at the cost of
1779 // some code complexity) into the loop below.
1780 llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1781 return BlockToChain.lookup(BB) == &Chain;
1782 });
1783
1784 if (WorkList.empty())
1785 return nullptr;
1786
1787 bool IsEHPad = WorkList[0]->isEHPad();
1788
1789 MachineBasicBlock *BestBlock = nullptr;
1790 BlockFrequency BestFreq;
1791 for (MachineBasicBlock *MBB : WorkList) {
1792 assert(MBB->isEHPad() == IsEHPad &&
1793 "EHPad mismatch between block and work list.");
1794
1795 BlockChain &SuccChain = *BlockToChain[MBB];
1796 if (&SuccChain == &Chain)
1797 continue;
1798
1799 assert(SuccChain.UnscheduledPredecessors == 0 &&
1800 "Found CFG-violating block");
1801
1802 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1803 LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "
1804 << printBlockFreq(MBFI->getMBFI(), CandidateFreq)
1805 << " (freq)\n");
1806
1807 // For ehpad, we layout the least probable first as to avoid jumping back
1808 // from least probable landingpads to more probable ones.
1809 //
1810 // FIXME: Using probability is probably (!) not the best way to achieve
1811 // this. We should probably have a more principled approach to layout
1812 // cleanup code.
1813 //
1814 // The goal is to get:
1815 //
1816 // +--------------------------+
1817 // | V
1818 // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1819 //
1820 // Rather than:
1821 //
1822 // +-------------------------------------+
1823 // V |
1824 // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1825 if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1826 continue;
1827
1828 BestBlock = MBB;
1829 BestFreq = CandidateFreq;
1830 }
1831
1832 return BestBlock;
1833}
1834
1835/// Retrieve the first unplaced basic block in the entire function.
1836///
1837/// This routine is called when we are unable to use the CFG to walk through
1838/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1839/// We walk through the function's blocks in order, starting from the
1840/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1841/// re-scanning the entire sequence on repeated calls to this routine.
1842MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1843 const BlockChain &PlacedChain,
1844 MachineFunction::iterator &PrevUnplacedBlockIt) {
1845
1846 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1847 ++I) {
1848 if (BlockChain *Chain = BlockToChain[&*I]; Chain != &PlacedChain) {
1849 PrevUnplacedBlockIt = I;
1850 // Now select the head of the chain to which the unplaced block belongs
1851 // as the block to place. This will force the entire chain to be placed,
1852 // and satisfies the requirements of merging chains.
1853 return *Chain->begin();
1854 }
1855 }
1856 return nullptr;
1857}
1858
1859/// Retrieve the first unplaced basic block among the blocks in BlockFilter.
1860///
1861/// This is similar to getFirstUnplacedBlock for the entire function, but since
1862/// the size of BlockFilter is typically far less than the number of blocks in
1863/// the entire function, iterating through the BlockFilter is more efficient.
1864/// When processing the entire funciton, using the version without BlockFilter
1865/// has a complexity of #(loops in function) * #(blocks in function), while this
1866/// version has a complexity of sum(#(loops in block) foreach block in function)
1867/// which is always smaller. For long function mostly sequential in structure,
1868/// the complexity is amortized to 1 * #(blocks in function).
1869MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1870 const BlockChain &PlacedChain,
1871 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
1872 const BlockFilterSet *BlockFilter) {
1873 assert(BlockFilter);
1874 for (; PrevUnplacedBlockInFilterIt != BlockFilter->end();
1875 ++PrevUnplacedBlockInFilterIt) {
1876 BlockChain *C = BlockToChain[*PrevUnplacedBlockInFilterIt];
1877 if (C != &PlacedChain) {
1878 return *C->begin();
1879 }
1880 }
1881 return nullptr;
1882}
1883
1884void MachineBlockPlacement::fillWorkLists(
1886 const BlockFilterSet *BlockFilter = nullptr) {
1887 BlockChain &Chain = *BlockToChain[MBB];
1888 if (!UpdatedPreds.insert(&Chain).second)
1889 return;
1890
1891 assert(
1892 Chain.UnscheduledPredecessors == 0 &&
1893 "Attempting to place block with unscheduled predecessors in worklist.");
1894 for (MachineBasicBlock *ChainBB : Chain) {
1895 assert(BlockToChain[ChainBB] == &Chain &&
1896 "Block in chain doesn't match BlockToChain map.");
1897 for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1898 if (BlockFilter && !BlockFilter->count(Pred))
1899 continue;
1900 if (BlockToChain[Pred] == &Chain)
1901 continue;
1902 ++Chain.UnscheduledPredecessors;
1903 }
1904 }
1905
1906 if (Chain.UnscheduledPredecessors != 0)
1907 return;
1908
1909 MachineBasicBlock *BB = *Chain.begin();
1910 if (BB->isEHPad())
1911 EHPadWorkList.push_back(BB);
1912 else
1913 BlockWorkList.push_back(BB);
1914}
1915
1916void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB,
1917 BlockChain &Chain,
1918 BlockFilterSet *BlockFilter) {
1919 assert(HeadBB && "BB must not be null.\n");
1920 assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1921 MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1922 BlockFilterSet::iterator PrevUnplacedBlockInFilterIt;
1923 if (BlockFilter)
1924 PrevUnplacedBlockInFilterIt = BlockFilter->begin();
1925
1926 const MachineBasicBlock *LoopHeaderBB = HeadBB;
1927 markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1928 MachineBasicBlock *BB = *std::prev(Chain.end());
1929 while (true) {
1930 assert(BB && "null block found at end of chain in loop.");
1931 assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1932 assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1933
1934 // Look for the best viable successor if there is one to place immediately
1935 // after this block.
1936 auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1937 MachineBasicBlock *BestSucc = Result.BB;
1938 bool ShouldTailDup = Result.ShouldTailDup;
1939 if (allowTailDupPlacement(*F))
1940 ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(
1941 BB, BestSucc, Chain, BlockFilter));
1942
1943 // If an immediate successor isn't available, look for the best viable
1944 // block among those we've identified as not violating the loop's CFG at
1945 // this point. This won't be a fallthrough, but it will increase locality.
1946 if (!BestSucc)
1947 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1948 if (!BestSucc)
1949 BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1950
1951 if (!BestSucc) {
1952 if (BlockFilter)
1953 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockInFilterIt,
1954 BlockFilter);
1955 else
1956 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt);
1957 if (!BestSucc)
1958 break;
1959
1960 LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1961 "layout successor until the CFG reduces\n");
1962 }
1963
1964 // Placement may have changed tail duplication opportunities.
1965 // Check for that now.
1966 if (allowTailDupPlacement(*F) && BestSucc && ShouldTailDup) {
1967 repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1968 BlockFilter, PrevUnplacedBlockIt,
1969 PrevUnplacedBlockInFilterIt);
1970 // If the chosen successor was duplicated into BB, don't bother laying
1971 // it out, just go round the loop again with BB as the chain end.
1972 if (!BB->isSuccessor(BestSucc))
1973 continue;
1974 }
1975
1976 // Place this block, updating the datastructures to reflect its placement.
1977 BlockChain &SuccChain = *BlockToChain[BestSucc];
1978 // Zero out UnscheduledPredecessors for the successor we're about to merge
1979 // in case we selected a successor that didn't fit naturally into the CFG.
1980 SuccChain.UnscheduledPredecessors = 0;
1981 LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1982 << getBlockName(BestSucc) << "\n");
1983 markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1984 Chain.merge(BestSucc, &SuccChain);
1985 BB = *std::prev(Chain.end());
1986 }
1987
1988 LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1989 << getBlockName(*Chain.begin()) << "\n");
1990}
1991
1992// If bottom of block BB has only one successor OldTop, in most cases it is
1993// profitable to move it before OldTop, except the following case:
1994//
1995// -->OldTop<-
1996// | . |
1997// | . |
1998// | . |
1999// ---Pred |
2000// | |
2001// BB-----
2002//
2003// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
2004// layout the other successor below it, so it can't reduce taken branch.
2005// In this case we keep its original layout.
2006bool MachineBlockPlacement::canMoveBottomBlockToTop(
2007 const MachineBasicBlock *BottomBlock, const MachineBasicBlock *OldTop) {
2008 if (BottomBlock->pred_size() != 1)
2009 return true;
2010 MachineBasicBlock *Pred = *BottomBlock->pred_begin();
2011 if (Pred->succ_size() != 2)
2012 return true;
2013
2014 MachineBasicBlock *OtherBB = *Pred->succ_begin();
2015 if (OtherBB == BottomBlock)
2016 OtherBB = *Pred->succ_rbegin();
2017 if (OtherBB == OldTop)
2018 return false;
2019
2020 return true;
2021}
2022
2023// Find out the possible fall through frequence to the top of a loop.
2025MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top,
2026 const BlockFilterSet &LoopBlockSet) {
2027 BlockFrequency MaxFreq = BlockFrequency(0);
2028 for (MachineBasicBlock *Pred : Top->predecessors()) {
2029 BlockChain *PredChain = BlockToChain[Pred];
2030 if (!LoopBlockSet.count(Pred) &&
2031 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2032 // Found a Pred block can be placed before Top.
2033 // Check if Top is the best successor of Pred.
2034 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2035 bool TopOK = true;
2036 for (MachineBasicBlock *Succ : Pred->successors()) {
2037 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2038 BlockChain *SuccChain = BlockToChain[Succ];
2039 // Check if Succ can be placed after Pred.
2040 // Succ should not be in any chain, or it is the head of some chain.
2041 if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
2042 (!SuccChain || Succ == *SuccChain->begin())) {
2043 TopOK = false;
2044 break;
2045 }
2046 }
2047 if (TopOK) {
2048 BlockFrequency EdgeFreq =
2049 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Top);
2050 if (EdgeFreq > MaxFreq)
2051 MaxFreq = EdgeFreq;
2052 }
2053 }
2054 }
2055 return MaxFreq;
2056}
2057
2058// Compute the fall through gains when move NewTop before OldTop.
2059//
2060// In following diagram, edges marked as "-" are reduced fallthrough, edges
2061// marked as "+" are increased fallthrough, this function computes
2062//
2063// SUM(increased fallthrough) - SUM(decreased fallthrough)
2064//
2065// |
2066// | -
2067// V
2068// --->OldTop
2069// | .
2070// | .
2071// +| . +
2072// | Pred --->
2073// | |-
2074// | V
2075// --- NewTop <---
2076// |-
2077// V
2078//
2079BlockFrequency MachineBlockPlacement::FallThroughGains(
2080 const MachineBasicBlock *NewTop, const MachineBasicBlock *OldTop,
2081 const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet) {
2082 BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
2083 BlockFrequency FallThrough2Exit = BlockFrequency(0);
2084 if (ExitBB)
2085 FallThrough2Exit =
2086 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, ExitBB);
2087 BlockFrequency BackEdgeFreq =
2088 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, OldTop);
2089
2090 // Find the best Pred of NewTop.
2091 MachineBasicBlock *BestPred = nullptr;
2092 BlockFrequency FallThroughFromPred = BlockFrequency(0);
2093 for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2094 if (!LoopBlockSet.count(Pred))
2095 continue;
2096 BlockChain *PredChain = BlockToChain[Pred];
2097 if (!PredChain || Pred == *std::prev(PredChain->end())) {
2098 BlockFrequency EdgeFreq =
2099 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, NewTop);
2100 if (EdgeFreq > FallThroughFromPred) {
2101 FallThroughFromPred = EdgeFreq;
2102 BestPred = Pred;
2103 }
2104 }
2105 }
2106
2107 // If NewTop is not placed after Pred, another successor can be placed
2108 // after Pred.
2109 BlockFrequency NewFreq = BlockFrequency(0);
2110 if (BestPred) {
2111 for (MachineBasicBlock *Succ : BestPred->successors()) {
2112 if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2113 continue;
2114 if (ComputedEdges.contains(Succ))
2115 continue;
2116 BlockChain *SuccChain = BlockToChain[Succ];
2117 if ((SuccChain && (Succ != *SuccChain->begin())) ||
2118 (SuccChain == BlockToChain[BestPred]))
2119 continue;
2120 BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2121 MBPI->getEdgeProbability(BestPred, Succ);
2122 if (EdgeFreq > NewFreq)
2123 NewFreq = EdgeFreq;
2124 }
2125 BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2126 MBPI->getEdgeProbability(BestPred, NewTop);
2127 if (NewFreq > OrigEdgeFreq) {
2128 // If NewTop is not the best successor of Pred, then Pred doesn't
2129 // fallthrough to NewTop. So there is no FallThroughFromPred and
2130 // NewFreq.
2131 NewFreq = BlockFrequency(0);
2132 FallThroughFromPred = BlockFrequency(0);
2133 }
2134 }
2135
2137 BlockFrequency Gains = BackEdgeFreq + NewFreq;
2138 BlockFrequency Lost =
2139 FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
2140 if (Gains > Lost)
2141 Result = Gains - Lost;
2142 return Result;
2143}
2144
2145/// Helper function of findBestLoopTop. Find the best loop top block
2146/// from predecessors of old top.
2147///
2148/// Look for a block which is strictly better than the old top for laying
2149/// out before the old top of the loop. This looks for only two patterns:
2150///
2151/// 1. a block has only one successor, the old loop top
2152///
2153/// Because such a block will always result in an unconditional jump,
2154/// rotating it in front of the old top is always profitable.
2155///
2156/// 2. a block has two successors, one is old top, another is exit
2157/// and it has more than one predecessors
2158///
2159/// If it is below one of its predecessors P, only P can fall through to
2160/// it, all other predecessors need a jump to it, and another conditional
2161/// jump to loop header. If it is moved before loop header, all its
2162/// predecessors jump to it, then fall through to loop header. So all its
2163/// predecessors except P can reduce one taken branch.
2164/// At the same time, move it before old top increases the taken branch
2165/// to loop exit block, so the reduced taken branch will be compared with
2166/// the increased taken branch to the loop exit block.
2167MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper(
2168 MachineBasicBlock *OldTop, const MachineLoop &L,
2169 const BlockFilterSet &LoopBlockSet) {
2170 // Check that the header hasn't been fused with a preheader block due to
2171 // crazy branches. If it has, we need to start with the header at the top to
2172 // prevent pulling the preheader into the loop body.
2173 BlockChain &HeaderChain = *BlockToChain[OldTop];
2174 if (!LoopBlockSet.count(*HeaderChain.begin()))
2175 return OldTop;
2176 if (OldTop != *HeaderChain.begin())
2177 return OldTop;
2178
2179 LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2180 << "\n");
2181
2182 BlockFrequency BestGains = BlockFrequency(0);
2183 MachineBasicBlock *BestPred = nullptr;
2184 for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2185 if (!LoopBlockSet.count(Pred))
2186 continue;
2187 if (Pred == L.getHeader())
2188 continue;
2189 LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2190 << Pred->succ_size() << " successors, "
2191 << printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
2192 if (Pred->succ_size() > 2)
2193 continue;
2194
2195 MachineBasicBlock *OtherBB = nullptr;
2196 if (Pred->succ_size() == 2) {
2197 OtherBB = *Pred->succ_begin();
2198 if (OtherBB == OldTop)
2199 OtherBB = *Pred->succ_rbegin();
2200 }
2201
2202 if (!canMoveBottomBlockToTop(Pred, OldTop))
2203 continue;
2204
2205 BlockFrequency Gains =
2206 FallThroughGains(Pred, OldTop, OtherBB, LoopBlockSet);
2207 if ((Gains > BlockFrequency(0)) &&
2208 (Gains > BestGains ||
2209 ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2210 BestPred = Pred;
2211 BestGains = Gains;
2212 }
2213 }
2214
2215 // If no direct predecessor is fine, just use the loop header.
2216 if (!BestPred) {
2217 LLVM_DEBUG(dbgs() << " final top unchanged\n");
2218 return OldTop;
2219 }
2220
2221 // Walk backwards through any straight line of predecessors.
2222 while (BestPred->pred_size() == 1 &&
2223 (*BestPred->pred_begin())->succ_size() == 1 &&
2224 *BestPred->pred_begin() != L.getHeader())
2225 BestPred = *BestPred->pred_begin();
2226
2227 LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2228 return BestPred;
2229}
2230
2231/// Find the best loop top block for layout.
2232///
2233/// This function iteratively calls findBestLoopTopHelper, until no new better
2234/// BB can be found.
2236MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2237 const BlockFilterSet &LoopBlockSet) {
2238 // Placing the latch block before the header may introduce an extra branch
2239 // that skips this block the first time the loop is executed, which we want
2240 // to avoid when optimising for size.
2241 // FIXME: in theory there is a case that does not introduce a new branch,
2242 // i.e. when the layout predecessor does not fallthrough to the loop header.
2243 // In practice this never happens though: there always seems to be a preheader
2244 // that can fallthrough and that is also placed before the header.
2245 if (llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get()))
2246 return L.getHeader();
2247
2248 MachineBasicBlock *OldTop = nullptr;
2249 MachineBasicBlock *NewTop = L.getHeader();
2250 while (NewTop != OldTop) {
2251 OldTop = NewTop;
2252 NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2253 if (NewTop != OldTop)
2254 ComputedEdges[NewTop] = {OldTop, false};
2255 }
2256 return NewTop;
2257}
2258
2259/// Find the best loop exiting block for layout.
2260///
2261/// This routine implements the logic to analyze the loop looking for the best
2262/// block to layout at the top of the loop. Typically this is done to maximize
2263/// fallthrough opportunities.
2265MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2266 const BlockFilterSet &LoopBlockSet,
2267 BlockFrequency &ExitFreq) {
2268 // We don't want to layout the loop linearly in all cases. If the loop header
2269 // is just a normal basic block in the loop, we want to look for what block
2270 // within the loop is the best one to layout at the top. However, if the loop
2271 // header has be pre-merged into a chain due to predecessors not having
2272 // analyzable branches, *and* the predecessor it is merged with is *not* part
2273 // of the loop, rotating the header into the middle of the loop will create
2274 // a non-contiguous range of blocks which is Very Bad. So start with the
2275 // header and only rotate if safe.
2276 BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2277 if (!LoopBlockSet.count(*HeaderChain.begin()))
2278 return nullptr;
2279
2280 BlockFrequency BestExitEdgeFreq;
2281 unsigned BestExitLoopDepth = 0;
2282 MachineBasicBlock *ExitingBB = nullptr;
2283 // If there are exits to outer loops, loop rotation can severely limit
2284 // fallthrough opportunities unless it selects such an exit. Keep a set of
2285 // blocks where rotating to exit with that block will reach an outer loop.
2286 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2287
2288 LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2289 << getBlockName(L.getHeader()) << "\n");
2290 for (MachineBasicBlock *MBB : L.getBlocks()) {
2291 BlockChain &Chain = *BlockToChain[MBB];
2292 // Ensure that this block is at the end of a chain; otherwise it could be
2293 // mid-way through an inner loop or a successor of an unanalyzable branch.
2294 if (MBB != *std::prev(Chain.end()))
2295 continue;
2296
2297 // Now walk the successors. We need to establish whether this has a viable
2298 // exiting successor and whether it has a viable non-exiting successor.
2299 // We store the old exiting state and restore it if a viable looping
2300 // successor isn't found.
2301 MachineBasicBlock *OldExitingBB = ExitingBB;
2302 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2303 bool HasLoopingSucc = false;
2304 for (MachineBasicBlock *Succ : MBB->successors()) {
2305 if (Succ->isEHPad())
2306 continue;
2307 if (Succ == MBB)
2308 continue;
2309 BlockChain &SuccChain = *BlockToChain[Succ];
2310 // Don't split chains, either this chain or the successor's chain.
2311 if (&Chain == &SuccChain) {
2312 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2313 << getBlockName(Succ) << " (chain conflict)\n");
2314 continue;
2315 }
2316
2317 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2318 if (LoopBlockSet.count(Succ)) {
2319 LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2320 << getBlockName(Succ) << " (" << SuccProb << ")\n");
2321 HasLoopingSucc = true;
2322 continue;
2323 }
2324
2325 unsigned SuccLoopDepth = 0;
2326 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2327 SuccLoopDepth = ExitLoop->getLoopDepth();
2328 if (ExitLoop->contains(&L))
2329 BlocksExitingToOuterLoop.insert(MBB);
2330 }
2331
2332 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2333 LLVM_DEBUG(
2334 dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2335 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
2336 << printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
2337 // Note that we bias this toward an existing layout successor to retain
2338 // incoming order in the absence of better information. The exit must have
2339 // a frequency higher than the current exit before we consider breaking
2340 // the layout.
2341 BranchProbability Bias(100 - ExitBlockBias, 100);
2342 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2343 ExitEdgeFreq > BestExitEdgeFreq ||
2344 (MBB->isLayoutSuccessor(Succ) &&
2345 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2346 BestExitEdgeFreq = ExitEdgeFreq;
2347 ExitingBB = MBB;
2348 }
2349 }
2350
2351 if (!HasLoopingSucc) {
2352 // Restore the old exiting state, no viable looping successor was found.
2353 ExitingBB = OldExitingBB;
2354 BestExitEdgeFreq = OldBestExitEdgeFreq;
2355 }
2356 }
2357 // Without a candidate exiting block or with only a single block in the
2358 // loop, just use the loop header to layout the loop.
2359 if (!ExitingBB) {
2360 LLVM_DEBUG(
2361 dbgs() << " No other candidate exit blocks, using loop header\n");
2362 return nullptr;
2363 }
2364 if (L.getNumBlocks() == 1) {
2365 LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2366 return nullptr;
2367 }
2368
2369 // Also, if we have exit blocks which lead to outer loops but didn't select
2370 // one of them as the exiting block we are rotating toward, disable loop
2371 // rotation altogether.
2372 if (!BlocksExitingToOuterLoop.empty() &&
2373 !BlocksExitingToOuterLoop.count(ExitingBB))
2374 return nullptr;
2375
2376 LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2377 << "\n");
2378 ExitFreq = BestExitEdgeFreq;
2379 return ExitingBB;
2380}
2381
2382/// Check if there is a fallthrough to loop header Top.
2383///
2384/// 1. Look for a Pred that can be layout before Top.
2385/// 2. Check if Top is the most possible successor of Pred.
2386bool MachineBlockPlacement::hasViableTopFallthrough(
2387 const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet) {
2388 for (MachineBasicBlock *Pred : Top->predecessors()) {
2389 BlockChain *PredChain = BlockToChain[Pred];
2390 if (!LoopBlockSet.count(Pred) &&
2391 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2392 // Found a Pred block can be placed before Top.
2393 // Check if Top is the best successor of Pred.
2394 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2395 bool TopOK = true;
2396 for (MachineBasicBlock *Succ : Pred->successors()) {
2397 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2398 BlockChain *SuccChain = BlockToChain[Succ];
2399 // Check if Succ can be placed after Pred.
2400 // Succ should not be in any chain, or it is the head of some chain.
2401 if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2402 TopOK = false;
2403 break;
2404 }
2405 }
2406 if (TopOK)
2407 return true;
2408 }
2409 }
2410 return false;
2411}
2412
2413/// Attempt to rotate an exiting block to the bottom of the loop.
2414///
2415/// Once we have built a chain, try to rotate it to line up the hot exit block
2416/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2417/// branches. For example, if the loop has fallthrough into its header and out
2418/// of its bottom already, don't rotate it.
2419void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2420 const MachineBasicBlock *ExitingBB,
2421 BlockFrequency ExitFreq,
2422 const BlockFilterSet &LoopBlockSet) {
2423 if (!ExitingBB)
2424 return;
2425
2426 MachineBasicBlock *Top = *LoopChain.begin();
2427 MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2428
2429 // If ExitingBB is already the last one in a chain then nothing to do.
2430 if (Bottom == ExitingBB)
2431 return;
2432
2433 // The entry block should always be the first BB in a function.
2434 if (Top->isEntryBlock())
2435 return;
2436
2437 bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2438
2439 // If the header has viable fallthrough, check whether the current loop
2440 // bottom is a viable exiting block. If so, bail out as rotating will
2441 // introduce an unnecessary branch.
2442 if (ViableTopFallthrough) {
2443 for (MachineBasicBlock *Succ : Bottom->successors()) {
2444 BlockChain *SuccChain = BlockToChain[Succ];
2445 if (!LoopBlockSet.count(Succ) &&
2446 (!SuccChain || Succ == *SuccChain->begin()))
2447 return;
2448 }
2449
2450 // Rotate will destroy the top fallthrough, we need to ensure the new exit
2451 // frequency is larger than top fallthrough.
2452 BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2453 if (FallThrough2Top >= ExitFreq)
2454 return;
2455 }
2456
2457 BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2458 if (ExitIt == LoopChain.end())
2459 return;
2460
2461 // Rotating a loop exit to the bottom when there is a fallthrough to top
2462 // trades the entry fallthrough for an exit fallthrough.
2463 // If there is no bottom->top edge, but the chosen exit block does have
2464 // a fallthrough, we break that fallthrough for nothing in return.
2465
2466 // Let's consider an example. We have a built chain of basic blocks
2467 // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2468 // By doing a rotation we get
2469 // Bk+1, ..., Bn, B1, ..., Bk
2470 // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2471 // If we had a fallthrough Bk -> Bk+1 it is broken now.
2472 // It might be compensated by fallthrough Bn -> B1.
2473 // So we have a condition to avoid creation of extra branch by loop rotation.
2474 // All below must be true to avoid loop rotation:
2475 // If there is a fallthrough to top (B1)
2476 // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2477 // There is no fallthrough from bottom (Bn) to top (B1).
2478 // Please note that there is no exit fallthrough from Bn because we checked it
2479 // above.
2480 if (ViableTopFallthrough) {
2481 assert(std::next(ExitIt) != LoopChain.end() &&
2482 "Exit should not be last BB");
2483 MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2484 if (ExitingBB->isSuccessor(NextBlockInChain))
2485 if (!Bottom->isSuccessor(Top))
2486 return;
2487 }
2488
2489 LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2490 << " at bottom\n");
2491 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2492}
2493
2494/// Attempt to rotate a loop based on profile data to reduce branch cost.
2495///
2496/// With profile data, we can determine the cost in terms of missed fall through
2497/// opportunities when rotating a loop chain and select the best rotation.
2498/// Basically, there are three kinds of cost to consider for each rotation:
2499/// 1. The possibly missed fall through edge (if it exists) from BB out of
2500/// the loop to the loop header.
2501/// 2. The possibly missed fall through edges (if they exist) from the loop
2502/// exits to BB out of the loop.
2503/// 3. The missed fall through edge (if it exists) from the last BB to the
2504/// first BB in the loop chain.
2505/// Therefore, the cost for a given rotation is the sum of costs listed above.
2506/// We select the best rotation with the smallest cost.
2507void MachineBlockPlacement::rotateLoopWithProfile(
2508 BlockChain &LoopChain, const MachineLoop &L,
2509 const BlockFilterSet &LoopBlockSet) {
2510 auto RotationPos = LoopChain.end();
2511 MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2512
2513 // The entry block should always be the first BB in a function.
2514 if (ChainHeaderBB->isEntryBlock())
2515 return;
2516
2517 BlockFrequency SmallestRotationCost = BlockFrequency::max();
2518
2519 // A utility lambda that scales up a block frequency by dividing it by a
2520 // branch probability which is the reciprocal of the scale.
2521 auto ScaleBlockFrequency = [](BlockFrequency Freq,
2522 unsigned Scale) -> BlockFrequency {
2523 if (Scale == 0)
2524 return BlockFrequency(0);
2525 // Use operator / between BlockFrequency and BranchProbability to implement
2526 // saturating multiplication.
2527 return Freq / BranchProbability(1, Scale);
2528 };
2529
2530 // Compute the cost of the missed fall-through edge to the loop header if the
2531 // chain head is not the loop header. As we only consider natural loops with
2532 // single header, this computation can be done only once.
2533 BlockFrequency HeaderFallThroughCost(0);
2534 for (auto *Pred : ChainHeaderBB->predecessors()) {
2535 BlockChain *PredChain = BlockToChain[Pred];
2536 if (!LoopBlockSet.count(Pred) &&
2537 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2538 auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2539 MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2540 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2541 // If the predecessor has only an unconditional jump to the header, we
2542 // need to consider the cost of this jump.
2543 if (Pred->succ_size() == 1)
2544 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2545 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2546 }
2547 }
2548
2549 // Here we collect all exit blocks in the loop, and for each exit we find out
2550 // its hottest exit edge. For each loop rotation, we define the loop exit cost
2551 // as the sum of frequencies of exit edges we collect here, excluding the exit
2552 // edge from the tail of the loop chain.
2554 for (auto *BB : LoopChain) {
2555 auto LargestExitEdgeProb = BranchProbability::getZero();
2556 for (auto *Succ : BB->successors()) {
2557 BlockChain *SuccChain = BlockToChain[Succ];
2558 if (!LoopBlockSet.count(Succ) &&
2559 (!SuccChain || Succ == *SuccChain->begin())) {
2560 auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2561 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2562 }
2563 }
2564 if (LargestExitEdgeProb > BranchProbability::getZero()) {
2565 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2566 ExitsWithFreq.emplace_back(BB, ExitFreq);
2567 }
2568 }
2569
2570 // In this loop we iterate every block in the loop chain and calculate the
2571 // cost assuming the block is the head of the loop chain. When the loop ends,
2572 // we should have found the best candidate as the loop chain's head.
2573 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2574 EndIter = LoopChain.end();
2575 Iter != EndIter; Iter++, TailIter++) {
2576 // TailIter is used to track the tail of the loop chain if the block we are
2577 // checking (pointed by Iter) is the head of the chain.
2578 if (TailIter == LoopChain.end())
2579 TailIter = LoopChain.begin();
2580
2581 auto TailBB = *TailIter;
2582
2583 // Calculate the cost by putting this BB to the top.
2585
2586 // If the current BB is the loop header, we need to take into account the
2587 // cost of the missed fall through edge from outside of the loop to the
2588 // header.
2589 if (Iter != LoopChain.begin())
2590 Cost += HeaderFallThroughCost;
2591
2592 // Collect the loop exit cost by summing up frequencies of all exit edges
2593 // except the one from the chain tail.
2594 for (auto &ExitWithFreq : ExitsWithFreq)
2595 if (TailBB != ExitWithFreq.first)
2596 Cost += ExitWithFreq.second;
2597
2598 // The cost of breaking the once fall-through edge from the tail to the top
2599 // of the loop chain. Here we need to consider three cases:
2600 // 1. If the tail node has only one successor, then we will get an
2601 // additional jmp instruction. So the cost here is (MisfetchCost +
2602 // JumpInstCost) * tail node frequency.
2603 // 2. If the tail node has two successors, then we may still get an
2604 // additional jmp instruction if the layout successor after the loop
2605 // chain is not its CFG successor. Note that the more frequently executed
2606 // jmp instruction will be put ahead of the other one. Assume the
2607 // frequency of those two branches are x and y, where x is the frequency
2608 // of the edge to the chain head, then the cost will be
2609 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2610 // 3. If the tail node has more than two successors (this rarely happens),
2611 // we won't consider any additional cost.
2612 if (TailBB->isSuccessor(*Iter)) {
2613 auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2614 if (TailBB->succ_size() == 1)
2615 Cost += ScaleBlockFrequency(TailBBFreq, MisfetchCost + JumpInstCost);
2616 else if (TailBB->succ_size() == 2) {
2617 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2618 auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2619 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2620 ? TailBBFreq * TailToHeadProb.getCompl()
2621 : TailToHeadFreq;
2622 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2623 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2624 }
2625 }
2626
2627 LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2628 << getBlockName(*Iter) << " to the top: "
2629 << printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
2630
2631 if (Cost < SmallestRotationCost) {
2632 SmallestRotationCost = Cost;
2633 RotationPos = Iter;
2634 }
2635 }
2636
2637 if (RotationPos != LoopChain.end()) {
2638 LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2639 << " to the top\n");
2640 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2641 }
2642}
2643
2644/// Collect blocks in the given loop that are to be placed.
2645///
2646/// When profile data is available, exclude cold blocks from the returned set;
2647/// otherwise, collect all blocks in the loop.
2649MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2650 // Collect the blocks in a set ordered by block number, as this gives the same
2651 // order as they appear in the function.
2652 struct MBBCompare {
2653 bool operator()(const MachineBasicBlock *X,
2654 const MachineBasicBlock *Y) const {
2655 return X->getNumber() < Y->getNumber();
2656 }
2657 };
2658 std::set<const MachineBasicBlock *, MBBCompare> LoopBlockSet;
2659
2660 // Filter cold blocks off from LoopBlockSet when profile data is available.
2661 // Collect the sum of frequencies of incoming edges to the loop header from
2662 // outside. If we treat the loop as a super block, this is the frequency of
2663 // the loop. Then for each block in the loop, we calculate the ratio between
2664 // its frequency and the frequency of the loop block. When it is too small,
2665 // don't add it to the loop chain. If there are outer loops, then this block
2666 // will be merged into the first outer loop chain for which this block is not
2667 // cold anymore. This needs precise profile data and we only do this when
2668 // profile data is available.
2669 if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2670 BlockFrequency LoopFreq(0);
2671 for (auto *LoopPred : L.getHeader()->predecessors())
2672 if (!L.contains(LoopPred))
2673 LoopFreq += MBFI->getBlockFreq(LoopPred) *
2674 MBPI->getEdgeProbability(LoopPred, L.getHeader());
2675
2676 for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2677 if (LoopBlockSet.count(LoopBB))
2678 continue;
2679 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2680 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2681 continue;
2682 BlockChain *Chain = BlockToChain[LoopBB];
2683 for (MachineBasicBlock *ChainBB : *Chain)
2684 LoopBlockSet.insert(ChainBB);
2685 }
2686 } else
2687 LoopBlockSet.insert(L.block_begin(), L.block_end());
2688
2689 // Copy the blocks into a BlockFilterSet, as iterating it is faster than
2690 // std::set. We will only remove blocks and never insert them, which will
2691 // preserve the ordering.
2692 BlockFilterSet Ret(LoopBlockSet.begin(), LoopBlockSet.end());
2693 return Ret;
2694}
2695
2696/// Forms basic block chains from the natural loop structures.
2697///
2698/// These chains are designed to preserve the existing *structure* of the code
2699/// as much as possible. We can then stitch the chains together in a way which
2700/// both preserves the topological structure and minimizes taken conditional
2701/// branches.
2702void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2703 // First recurse through any nested loops, building chains for those inner
2704 // loops.
2705 for (const MachineLoop *InnerLoop : L)
2706 buildLoopChains(*InnerLoop);
2707
2708 assert(BlockWorkList.empty() &&
2709 "BlockWorkList not empty when starting to build loop chains.");
2710 assert(EHPadWorkList.empty() &&
2711 "EHPadWorkList not empty when starting to build loop chains.");
2712 BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2713
2714 // Check if we have profile data for this function. If yes, we will rotate
2715 // this loop by modeling costs more precisely which requires the profile data
2716 // for better layout.
2717 bool RotateLoopWithProfile =
2719 (PreciseRotationCost && F->getFunction().hasProfileData());
2720
2721 // First check to see if there is an obviously preferable top block for the
2722 // loop. This will default to the header, but may end up as one of the
2723 // predecessors to the header if there is one which will result in strictly
2724 // fewer branches in the loop body.
2725 MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2726
2727 // If we selected just the header for the loop top, look for a potentially
2728 // profitable exit block in the event that rotating the loop can eliminate
2729 // branches by placing an exit edge at the bottom.
2730 //
2731 // Loops are processed innermost to uttermost, make sure we clear
2732 // PreferredLoopExit before processing a new loop.
2733 PreferredLoopExit = nullptr;
2734 BlockFrequency ExitFreq;
2735 if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2736 PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2737
2738 BlockChain &LoopChain = *BlockToChain[LoopTop];
2739
2740 // FIXME: This is a really lame way of walking the chains in the loop: we
2741 // walk the blocks, and use a set to prevent visiting a particular chain
2742 // twice.
2743 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2744 assert(LoopChain.UnscheduledPredecessors == 0 &&
2745 "LoopChain should not have unscheduled predecessors.");
2746 UpdatedPreds.insert(&LoopChain);
2747
2748 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2749 fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2750
2751 buildChain(LoopTop, LoopChain, &LoopBlockSet);
2752
2753 if (RotateLoopWithProfile)
2754 rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2755 else
2756 rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2757
2758 LLVM_DEBUG({
2759 // Crash at the end so we get all of the debugging output first.
2760 bool BadLoop = false;
2761 if (LoopChain.UnscheduledPredecessors) {
2762 BadLoop = true;
2763 dbgs() << "Loop chain contains a block without its preds placed!\n"
2764 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2765 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2766 }
2767 for (MachineBasicBlock *ChainBB : LoopChain) {
2768 dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2769 if (!LoopBlockSet.remove(ChainBB)) {
2770 // We don't mark the loop as bad here because there are real situations
2771 // where this can occur. For example, with an unanalyzable fallthrough
2772 // from a loop block to a non-loop block or vice versa.
2773 dbgs() << "Loop chain contains a block not contained by the loop!\n"
2774 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2775 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2776 << " Bad block: " << getBlockName(ChainBB) << "\n";
2777 }
2778 }
2779
2780 if (!LoopBlockSet.empty()) {
2781 BadLoop = true;
2782 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2783 dbgs() << "Loop contains blocks never placed into a chain!\n"
2784 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2785 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2786 << " Bad block: " << getBlockName(LoopBB) << "\n";
2787 }
2788 assert(!BadLoop && "Detected problems with the placement of this loop.");
2789 });
2790
2791 BlockWorkList.clear();
2792 EHPadWorkList.clear();
2793}
2794
2795void MachineBlockPlacement::buildCFGChains() {
2796 // Ensure that every BB in the function has an associated chain to simplify
2797 // the assumptions of the remaining algorithm.
2798 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2799 for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2800 ++FI) {
2801 MachineBasicBlock *BB = &*FI;
2802 BlockChain *Chain =
2803 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2804 // Also, merge any blocks which we cannot reason about and must preserve
2805 // the exact fallthrough behavior for.
2806 while (true) {
2807 Cond.clear();
2808 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2809 if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2810 break;
2811
2812 MachineFunction::iterator NextFI = std::next(FI);
2813 MachineBasicBlock *NextBB = &*NextFI;
2814 // Ensure that the layout successor is a viable block, as we know that
2815 // fallthrough is a possibility.
2816 assert(NextFI != FE && "Can't fallthrough past the last block.");
2817 LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2818 << getBlockName(BB) << " -> " << getBlockName(NextBB)
2819 << "\n");
2820 Chain->merge(NextBB, nullptr);
2821#ifndef NDEBUG
2822 BlocksWithUnanalyzableExits.insert(&*BB);
2823#endif
2824 FI = NextFI;
2825 BB = NextBB;
2826 }
2827 }
2828
2829 // Build any loop-based chains.
2830 PreferredLoopExit = nullptr;
2831 for (MachineLoop *L : *MLI)
2832 buildLoopChains(*L);
2833
2834 assert(BlockWorkList.empty() &&
2835 "BlockWorkList should be empty before building final chain.");
2836 assert(EHPadWorkList.empty() &&
2837 "EHPadWorkList should be empty before building final chain.");
2838
2839 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2840 for (MachineBasicBlock &MBB : *F)
2841 fillWorkLists(&MBB, UpdatedPreds);
2842
2843 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2844 buildChain(&F->front(), FunctionChain);
2845
2846#ifndef NDEBUG
2847 using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2848#endif
2849 LLVM_DEBUG({
2850 // Crash at the end so we get all of the debugging output first.
2851 bool BadFunc = false;
2852 FunctionBlockSetType FunctionBlockSet;
2853 for (MachineBasicBlock &MBB : *F)
2854 FunctionBlockSet.insert(&MBB);
2855
2856 for (MachineBasicBlock *ChainBB : FunctionChain)
2857 if (!FunctionBlockSet.erase(ChainBB)) {
2858 BadFunc = true;
2859 dbgs() << "Function chain contains a block not in the function!\n"
2860 << " Bad block: " << getBlockName(ChainBB) << "\n";
2861 }
2862
2863 if (!FunctionBlockSet.empty()) {
2864 BadFunc = true;
2865 for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2866 dbgs() << "Function contains blocks never placed into a chain!\n"
2867 << " Bad block: " << getBlockName(RemainingBB) << "\n";
2868 }
2869 assert(!BadFunc && "Detected problems with the block placement.");
2870 });
2871
2872 // Remember original layout ordering, so we can update terminators after
2873 // reordering to point to the original layout successor.
2874 SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2875 F->getNumBlockIDs());
2876 {
2877 MachineBasicBlock *LastMBB = nullptr;
2878 for (auto &MBB : *F) {
2879 if (LastMBB != nullptr)
2880 OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2881 LastMBB = &MBB;
2882 }
2883 OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2884 }
2885
2886 // Splice the blocks into place.
2887 MachineFunction::iterator InsertPos = F->begin();
2888 LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2889 for (MachineBasicBlock *ChainBB : FunctionChain) {
2890 LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2891 : " ... ")
2892 << getBlockName(ChainBB) << "\n");
2893 if (InsertPos != MachineFunction::iterator(ChainBB))
2894 F->splice(InsertPos, ChainBB);
2895 else
2896 ++InsertPos;
2897
2898 // Update the terminator of the previous block.
2899 if (ChainBB == *FunctionChain.begin())
2900 continue;
2901 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2902
2903 // FIXME: It would be awesome of updateTerminator would just return rather
2904 // than assert when the branch cannot be analyzed in order to remove this
2905 // boiler plate.
2906 Cond.clear();
2907 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2908
2909#ifndef NDEBUG
2910 if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2911 // Given the exact block placement we chose, we may actually not _need_ to
2912 // be able to edit PrevBB's terminator sequence, but not being _able_ to
2913 // do that at this point is a bug.
2914 assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2915 !PrevBB->canFallThrough()) &&
2916 "Unexpected block with un-analyzable fallthrough!");
2917 Cond.clear();
2918 TBB = FBB = nullptr;
2919 }
2920#endif
2921
2922 // The "PrevBB" is not yet updated to reflect current code layout, so,
2923 // o. it may fall-through to a block without explicit "goto" instruction
2924 // before layout, and no longer fall-through it after layout; or
2925 // o. just opposite.
2926 //
2927 // analyzeBranch() may return erroneous value for FBB when these two
2928 // situations take place. For the first scenario FBB is mistakenly set NULL;
2929 // for the 2nd scenario, the FBB, which is expected to be NULL, is
2930 // mistakenly pointing to "*BI".
2931 // Thus, if the future change needs to use FBB before the layout is set, it
2932 // has to correct FBB first by using the code similar to the following:
2933 //
2934 // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2935 // PrevBB->updateTerminator();
2936 // Cond.clear();
2937 // TBB = FBB = nullptr;
2938 // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2939 // // FIXME: This should never take place.
2940 // TBB = FBB = nullptr;
2941 // }
2942 // }
2943 if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2944 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2945 }
2946 }
2947
2948 // Fixup the last block.
2949 Cond.clear();
2950 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2951 if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2952 MachineBasicBlock *PrevBB = &F->back();
2953 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2954 }
2955
2956 BlockWorkList.clear();
2957 EHPadWorkList.clear();
2958}
2959
2960void MachineBlockPlacement::optimizeBranches() {
2961 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2963
2964 // Now that all the basic blocks in the chain have the proper layout,
2965 // make a final call to analyzeBranch with AllowModify set.
2966 // Indeed, the target may be able to optimize the branches in a way we
2967 // cannot because all branches may not be analyzable.
2968 // E.g., the target may be able to remove an unconditional branch to
2969 // a fallthrough when it occurs after predicated terminators.
2970 for (MachineBasicBlock *ChainBB : FunctionChain) {
2971 Cond.clear();
2972 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2973 if (TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true))
2974 continue;
2975 if (!TBB || !FBB || Cond.empty())
2976 continue;
2977 // If we are optimizing for size we do not consider the runtime performance.
2978 // Instead, we retain the original branch condition so we have more uniform
2979 // instructions which will benefit ICF.
2980 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()))
2981 continue;
2982 // If ChainBB has a two-way branch, try to re-order the branches
2983 // such that we branch to the successor with higher probability first.
2984 if (MBPI->getEdgeProbability(ChainBB, TBB) >=
2985 MBPI->getEdgeProbability(ChainBB, FBB))
2986 continue;
2988 continue;
2989 LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2990 << getBlockName(ChainBB) << "\n");
2991 LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB)
2992 << "\n");
2993 auto Dl = ChainBB->findBranchDebugLoc();
2994 TII->removeBranch(*ChainBB);
2995 TII->insertBranch(*ChainBB, FBB, TBB, Cond, Dl);
2996 }
2997}
2998
2999void MachineBlockPlacement::alignBlocks() {
3000 // Walk through the backedges of the function now that we have fully laid out
3001 // the basic blocks and align the destination of each backedge. We don't rely
3002 // exclusively on the loop info here so that we can align backedges in
3003 // unnatural CFGs and backedges that were introduced purely because of the
3004 // loop rotations done during this layout pass.
3006 if (F->getFunction().hasMinSize() ||
3007 (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
3008 return;
3009 }
3010
3011 BlockChain &FunctionChain = *BlockToChain[&F->front()];
3012 // Empty chain.
3013 if (FunctionChain.begin() == FunctionChain.end())
3014 return;
3015
3016 const BranchProbability ColdProb(1, 5); // 20%
3017 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
3018 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
3019 for (MachineBasicBlock *ChainBB : FunctionChain) {
3020 if (ChainBB == *FunctionChain.begin())
3021 continue;
3022
3023 // Don't align non-looping basic blocks. These are unlikely to execute
3024 // enough times to matter in practice. Note that we'll still handle
3025 // unnatural CFGs inside of a natural outer loop (the common case) and
3026 // rotated loops.
3027 MachineLoop *L = MLI->getLoopFor(ChainBB);
3028 if (!L)
3029 continue;
3030
3031 const Align TLIAlign = TLI->getPrefLoopAlignment(L);
3032 unsigned MDAlign = 1;
3033 MDNode *LoopID = L->getLoopID();
3034 if (LoopID) {
3035 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
3036 MDNode *MD = dyn_cast<MDNode>(MDO);
3037 if (MD == nullptr)
3038 continue;
3039 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
3040 if (S == nullptr)
3041 continue;
3042 if (S->getString() == "llvm.loop.align") {
3043 assert(MD->getNumOperands() == 2 &&
3044 "per-loop align metadata should have two operands.");
3045 MDAlign =
3046 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
3047 assert(MDAlign >= 1 && "per-loop align value must be positive.");
3048 }
3049 }
3050 }
3051
3052 // Use max of the TLIAlign and MDAlign
3053 const Align LoopAlign = std::max(TLIAlign, Align(MDAlign));
3054 if (LoopAlign == 1)
3055 continue; // Don't care about loop alignment.
3056
3057 // If the block is cold relative to the function entry don't waste space
3058 // aligning it.
3059 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
3060 if (Freq < WeightedEntryFreq)
3061 continue;
3062
3063 // If the block is cold relative to its loop header, don't align it
3064 // regardless of what edges into the block exist.
3065 MachineBasicBlock *LoopHeader = L->getHeader();
3066 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
3067 if (Freq < (LoopHeaderFreq * ColdProb))
3068 continue;
3069
3070 // If the global profiles indicates so, don't align it.
3071 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
3072 !TLI->alignLoopsWithOptSize())
3073 continue;
3074
3075 // Check for the existence of a non-layout predecessor which would benefit
3076 // from aligning this block.
3077 MachineBasicBlock *LayoutPred =
3078 &*std::prev(MachineFunction::iterator(ChainBB));
3079
3080 auto DetermineMaxAlignmentPadding = [&]() {
3081 // Set the maximum bytes allowed to be emitted for alignment.
3082 unsigned MaxBytes;
3083 if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
3085 else
3086 MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
3087 ChainBB->setMaxBytesForAlignment(MaxBytes);
3088 };
3089
3090 // Force alignment if all the predecessors are jumps. We already checked
3091 // that the block isn't cold above.
3092 if (!LayoutPred->isSuccessor(ChainBB)) {
3093 ChainBB->setAlignment(LoopAlign);
3094 DetermineMaxAlignmentPadding();
3095 continue;
3096 }
3097
3098 // Align this block if the layout predecessor's edge into this block is
3099 // cold relative to the block. When this is true, other predecessors make up
3100 // all of the hot entries into the block and thus alignment is likely to be
3101 // important.
3102 BranchProbability LayoutProb =
3103 MBPI->getEdgeProbability(LayoutPred, ChainBB);
3104 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
3105 if (LayoutEdgeFreq <= (Freq * ColdProb)) {
3106 ChainBB->setAlignment(LoopAlign);
3107 DetermineMaxAlignmentPadding();
3108 }
3109 }
3110
3111 const bool HasMaxBytesOverride =
3112 MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3113
3114 if (AlignAllBlock)
3115 // Align all of the blocks in the function to a specific alignment.
3116 for (MachineBasicBlock &MBB : *F) {
3117 if (HasMaxBytesOverride)
3120 else
3122 }
3123 else if (AlignAllNonFallThruBlocks) {
3124 // Align all of the blocks that have no fall-through predecessors to a
3125 // specific alignment.
3126 for (auto MBI = std::next(F->begin()), MBE = F->end(); MBI != MBE; ++MBI) {
3127 auto LayoutPred = std::prev(MBI);
3128 if (!LayoutPred->isSuccessor(&*MBI)) {
3129 if (HasMaxBytesOverride)
3130 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3132 else
3133 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3134 }
3135 }
3136 }
3137}
3138
3139/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
3140/// it was duplicated into its chain predecessor and removed.
3141/// \p BB - Basic block that may be duplicated.
3142///
3143/// \p LPred - Chosen layout predecessor of \p BB.
3144/// Updated to be the chain end if LPred is removed.
3145/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3146/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3147/// Used to identify which blocks to update predecessor
3148/// counts.
3149/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3150/// chosen in the given order due to unnatural CFG
3151/// only needed if \p BB is removed and
3152/// \p PrevUnplacedBlockIt pointed to \p BB.
3153/// @return true if \p BB was removed.
3154bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
3156 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
3157 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3158 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) {
3159 bool Removed, DuplicatedToLPred;
3160 bool DuplicatedToOriginalLPred;
3161 Removed = maybeTailDuplicateBlock(
3162 BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3163 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3164 if (!Removed)
3165 return false;
3166 DuplicatedToOriginalLPred = DuplicatedToLPred;
3167 // Iteratively try to duplicate again. It can happen that a block that is
3168 // duplicated into is still small enough to be duplicated again.
3169 // No need to call markBlockSuccessors in this case, as the blocks being
3170 // duplicated from here on are already scheduled.
3171 while (DuplicatedToLPred && Removed) {
3172 MachineBasicBlock *DupBB, *DupPred;
3173 // The removal callback causes Chain.end() to be updated when a block is
3174 // removed. On the first pass through the loop, the chain end should be the
3175 // same as it was on function entry. On subsequent passes, because we are
3176 // duplicating the block at the end of the chain, if it is removed the
3177 // chain will have shrunk by one block.
3178 BlockChain::iterator ChainEnd = Chain.end();
3179 DupBB = *(--ChainEnd);
3180 // Now try to duplicate again.
3181 if (ChainEnd == Chain.begin())
3182 break;
3183 DupPred = *std::prev(ChainEnd);
3184 Removed = maybeTailDuplicateBlock(
3185 DupBB, DupPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3186 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3187 }
3188 // If BB was duplicated into LPred, it is now scheduled. But because it was
3189 // removed, markChainSuccessors won't be called for its chain. Instead we
3190 // call markBlockSuccessors for LPred to achieve the same effect. This must go
3191 // at the end because repeating the tail duplication can increase the number
3192 // of unscheduled predecessors.
3193 LPred = *std::prev(Chain.end());
3194 if (DuplicatedToOriginalLPred)
3195 markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3196 return true;
3197}
3198
3199/// Tail duplicate \p BB into (some) predecessors if profitable.
3200/// \p BB - Basic block that may be duplicated
3201/// \p LPred - Chosen layout predecessor of \p BB
3202/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3203/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3204/// Used to identify which blocks to update predecessor
3205/// counts.
3206/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3207/// chosen in the given order due to unnatural CFG
3208/// only needed if \p BB is removed and
3209/// \p PrevUnplacedBlockIt pointed to \p BB.
3210/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3211/// \return - True if the block was duplicated into all preds and removed.
3212bool MachineBlockPlacement::maybeTailDuplicateBlock(
3213 MachineBasicBlock *BB, MachineBasicBlock *LPred, BlockChain &Chain,
3214 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3215 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
3216 bool &DuplicatedToLPred) {
3217 DuplicatedToLPred = false;
3218 if (!shouldTailDuplicate(BB))
3219 return false;
3220
3221 LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3222 << "\n");
3223
3224 // This has to be a callback because none of it can be done after
3225 // BB is deleted.
3226 bool Removed = false;
3227 auto RemovalCallback = [&](MachineBasicBlock *RemBB) {
3228 // Signal to outer function
3229 Removed = true;
3230
3231 // Remove from the Chain and Chain Map
3232 if (auto It = BlockToChain.find(RemBB); It != BlockToChain.end()) {
3233 It->second->remove(RemBB);
3234 BlockToChain.erase(It);
3235 }
3236
3237 // Handle the unplaced block iterator
3238 if (&(*PrevUnplacedBlockIt) == RemBB) {
3239 PrevUnplacedBlockIt++;
3240 }
3241
3242 // Handle the Work Lists
3243 if (RemBB->isEHPad()) {
3244 llvm::erase(EHPadWorkList, RemBB);
3245 } else {
3246 llvm::erase(BlockWorkList, RemBB);
3247 }
3248
3249 // Handle the filter set
3250 if (BlockFilter) {
3251 auto It = llvm::find(*BlockFilter, RemBB);
3252 // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt
3253 // pointing to the same element as before.
3254 if (It != BlockFilter->end()) {
3255 if (It < PrevUnplacedBlockInFilterIt) {
3256 const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt;
3257 // BlockFilter is a SmallVector so all elements after RemBB are
3258 // shifted to the front by 1 after its deletion.
3259 auto Distance = PrevUnplacedBlockInFilterIt - It - 1;
3260 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance;
3261 assert(*PrevUnplacedBlockInFilterIt == PrevBB);
3262 (void)PrevBB;
3263 } else if (It == PrevUnplacedBlockInFilterIt)
3264 // The block pointed by PrevUnplacedBlockInFilterIt is erased, we
3265 // have to set it to the next element.
3266 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It);
3267 else
3268 BlockFilter->erase(It);
3269 }
3270 }
3271
3272 // Remove the block from loop info.
3273 MLI->removeBlock(RemBB);
3274 if (RemBB == PreferredLoopExit)
3275 PreferredLoopExit = nullptr;
3276
3277 LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " << getBlockName(RemBB)
3278 << "\n");
3279 };
3280 auto RemovalCallbackRef =
3282
3284 bool IsSimple = TailDup.isSimpleBB(BB);
3286 SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3287 if (F->getFunction().hasProfileData()) {
3288 // We can do partial duplication with precise profile information.
3289 findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3290 if (CandidatePreds.size() == 0)
3291 return false;
3292 if (CandidatePreds.size() < BB->pred_size())
3293 CandidatePtr = &CandidatePreds;
3294 }
3295 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3296 &RemovalCallbackRef, CandidatePtr);
3297
3298 // Update UnscheduledPredecessors to reflect tail-duplication.
3299 DuplicatedToLPred = false;
3300 for (MachineBasicBlock *Pred : DuplicatedPreds) {
3301 // We're only looking for unscheduled predecessors that match the filter.
3302 BlockChain *PredChain = BlockToChain[Pred];
3303 if (Pred == LPred)
3304 DuplicatedToLPred = true;
3305 if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) ||
3306 PredChain == &Chain)
3307 continue;
3308 for (MachineBasicBlock *NewSucc : Pred->successors()) {
3309 if (BlockFilter && !BlockFilter->count(NewSucc))
3310 continue;
3311 BlockChain *NewChain = BlockToChain[NewSucc];
3312 if (NewChain != &Chain && NewChain != PredChain)
3313 NewChain->UnscheduledPredecessors++;
3314 }
3315 }
3316 return Removed;
3317}
3318
3319// Count the number of actual machine instructions.
3321 uint64_t InstrCount = 0;
3322 for (MachineInstr &MI : *MBB) {
3323 if (!MI.isPHI() && !MI.isMetaInstruction())
3324 InstrCount += 1;
3325 }
3326 return InstrCount;
3327}
3328
3329// The size cost of duplication is the instruction size of the duplicated block.
3330// So we should scale the threshold accordingly. But the instruction size is not
3331// available on all targets, so we use the number of instructions instead.
3332BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3333 return BlockFrequency(DupThreshold.getFrequency() * countMBBInstruction(BB));
3334}
3335
3336// Returns true if BB is Pred's best successor.
3337bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3338 MachineBasicBlock *Pred,
3339 BlockFilterSet *BlockFilter) {
3340 if (BB == Pred)
3341 return false;
3342 if (BlockFilter && !BlockFilter->count(Pred))
3343 return false;
3344 BlockChain *PredChain = BlockToChain[Pred];
3345 if (PredChain && (Pred != *std::prev(PredChain->end())))
3346 return false;
3347
3348 // Find the successor with largest probability excluding BB.
3350 for (MachineBasicBlock *Succ : Pred->successors())
3351 if (Succ != BB) {
3352 if (BlockFilter && !BlockFilter->count(Succ))
3353 continue;
3354 BlockChain *SuccChain = BlockToChain[Succ];
3355 if (SuccChain && (Succ != *SuccChain->begin()))
3356 continue;
3357 BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3358 if (SuccProb > BestProb)
3359 BestProb = SuccProb;
3360 }
3361
3362 BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3363 if (BBProb <= BestProb)
3364 return false;
3365
3366 // Compute the number of reduced taken branches if Pred falls through to BB
3367 // instead of another successor. Then compare it with threshold.
3368 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3369 BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3370 return Gain > scaleThreshold(BB);
3371}
3372
3373// Find out the predecessors of BB and BB can be beneficially duplicated into
3374// them.
3375void MachineBlockPlacement::findDuplicateCandidates(
3377 BlockFilterSet *BlockFilter) {
3378 MachineBasicBlock *Fallthrough = nullptr;
3379 BranchProbability DefaultBranchProb = BranchProbability::getZero();
3380 BlockFrequency BBDupThreshold(scaleThreshold(BB));
3383
3384 // Sort for highest frequency.
3385 auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3386 return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3387 };
3388 auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3389 return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3390 };
3391 llvm::stable_sort(Succs, CmpSucc);
3392 llvm::stable_sort(Preds, CmpPred);
3393
3394 auto SuccIt = Succs.begin();
3395 if (SuccIt != Succs.end()) {
3396 DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3397 }
3398
3399 // For each predecessors of BB, compute the benefit of duplicating BB,
3400 // if it is larger than the threshold, add it into Candidates.
3401 //
3402 // If we have following control flow.
3403 //
3404 // PB1 PB2 PB3 PB4
3405 // \ | / /\
3406 // \ | / / \
3407 // \ |/ / \
3408 // BB----/ OB
3409 // /\
3410 // / \
3411 // SB1 SB2
3412 //
3413 // And it can be partially duplicated as
3414 //
3415 // PB2+BB
3416 // | PB1 PB3 PB4
3417 // | | / /\
3418 // | | / / \
3419 // | |/ / \
3420 // | BB----/ OB
3421 // |\ /|
3422 // | X |
3423 // |/ \|
3424 // SB2 SB1
3425 //
3426 // The benefit of duplicating into a predecessor is defined as
3427 // Orig_taken_branch - Duplicated_taken_branch
3428 //
3429 // The Orig_taken_branch is computed with the assumption that predecessor
3430 // jumps to BB and the most possible successor is laid out after BB.
3431 //
3432 // The Duplicated_taken_branch is computed with the assumption that BB is
3433 // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3434 // SB2 for PB2 in our case). If there is no available successor, the combined
3435 // block jumps to all BB's successor, like PB3 in this example.
3436 //
3437 // If a predecessor has multiple successors, so BB can't be duplicated into
3438 // it. But it can beneficially fall through to BB, and duplicate BB into other
3439 // predecessors.
3440 for (MachineBasicBlock *Pred : Preds) {
3441 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3442
3443 if (!TailDup.canTailDuplicate(BB, Pred)) {
3444 // BB can't be duplicated into Pred, but it is possible to be layout
3445 // below Pred.
3446 if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3447 Fallthrough = Pred;
3448 if (SuccIt != Succs.end())
3449 SuccIt++;
3450 }
3451 continue;
3452 }
3453
3454 BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3455 BlockFrequency DupCost;
3456 if (SuccIt == Succs.end()) {
3457 // Jump to all successors;
3458 if (Succs.size() > 0)
3459 DupCost += PredFreq;
3460 } else {
3461 // Fallthrough to *SuccIt, jump to all other successors;
3462 DupCost += PredFreq;
3463 DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3464 }
3465
3466 assert(OrigCost >= DupCost);
3467 OrigCost -= DupCost;
3468 if (OrigCost > BBDupThreshold) {
3469 Candidates.push_back(Pred);
3470 if (SuccIt != Succs.end())
3471 SuccIt++;
3472 }
3473 }
3474
3475 // No predecessors can optimally fallthrough to BB.
3476 // So we can change one duplication into fallthrough.
3477 if (!Fallthrough) {
3478 if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3479 Candidates[0] = Candidates.back();
3480 Candidates.pop_back();
3481 }
3482 }
3483}
3484
3485void MachineBlockPlacement::initTailDupThreshold() {
3486 DupThreshold = BlockFrequency(0);
3487 if (F->getFunction().hasProfileData()) {
3488 // We prefer to use prifile count.
3489 uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3490 if (HotThreshold != UINT64_MAX) {
3491 UseProfileCount = true;
3492 DupThreshold =
3493 BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
3494 } else {
3495 // Profile count is not available, we can use block frequency instead.
3496 BlockFrequency MaxFreq = BlockFrequency(0);
3497 for (MachineBasicBlock &MBB : *F) {
3498 BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3499 if (Freq > MaxFreq)
3500 MaxFreq = Freq;
3501 }
3502
3503 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3504 DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
3505 UseProfileCount = false;
3506 }
3507 }
3508
3509 TailDupSize = TailDupPlacementThreshold;
3510 // If only the aggressive threshold is explicitly set, use it.
3511 if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3512 TailDupPlacementThreshold.getNumOccurrences() == 0)
3514
3515 // For aggressive optimization, we can adjust some thresholds to be less
3516 // conservative.
3517 if (OptLevel >= CodeGenOptLevel::Aggressive) {
3518 // At O3 we should be more willing to copy blocks for tail duplication. This
3519 // increases size pressure, so we only do it at O3
3520 // Do this unless only the regular threshold is explicitly set.
3521 if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3522 TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3524 }
3525
3526 // If there's no threshold provided through options, query the target
3527 // information for a threshold instead.
3528 if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3529 (OptLevel < CodeGenOptLevel::Aggressive ||
3530 TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3531 TailDupSize = TII->getTailDuplicateSize(OptLevel);
3532}
3533
3537 auto *MBPI = &MFAM.getResult<MachineBranchProbabilityAnalysis>(MF);
3538 auto MBFI = std::make_unique<MBFIWrapper>(
3540 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
3541 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
3543 : nullptr;
3545 .getCachedResult<ProfileSummaryAnalysis>(
3546 *MF.getFunction().getParent());
3547 if (!PSI)
3548 report_fatal_error("MachineBlockPlacement requires ProfileSummaryAnalysis",
3549 false);
3550
3551 MachineBlockPlacement MBP(MBPI, MLI, PSI, std::move(MBFI), MPDT,
3552 AllowTailMerge);
3553
3554 if (!MBP.run(MF))
3555 return PreservedAnalyses::all();
3556
3558}
3559
3561 raw_ostream &OS,
3562 function_ref<StringRef(StringRef)> MapClassName2PassName) const {
3563 OS << MapClassName2PassName(name());
3564 if (!AllowTailMerge)
3565 OS << "<no-tail-merge>";
3566}
3567
3568bool MachineBlockPlacement::run(MachineFunction &MF) {
3569
3570 // Check for single-block functions and skip them.
3571 if (std::next(MF.begin()) == MF.end())
3572 return false;
3573
3574 F = &MF;
3575 OptLevel = F->getTarget().getOptLevel();
3576
3577 TII = MF.getSubtarget().getInstrInfo();
3578 TLI = MF.getSubtarget().getTargetLowering();
3579
3580 // Initialize PreferredLoopExit to nullptr here since it may never be set if
3581 // there are no MachineLoops.
3582 PreferredLoopExit = nullptr;
3583
3584 assert(BlockToChain.empty() &&
3585 "BlockToChain map should be empty before starting placement.");
3586 assert(ComputedEdges.empty() &&
3587 "Computed Edge map should be empty before starting placement.");
3588
3589 // Initialize tail duplication thresholds.
3590 initTailDupThreshold();
3591
3592 const bool OptForSize =
3593 llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3594 // Determine whether to use ext-tsp for perf/size optimization. The method
3595 // is beneficial only for instances with at least 3 basic blocks and it can be
3596 // disabled for huge functions (exceeding a certain size).
3597 bool UseExtTspForPerf = false;
3598 bool UseExtTspForSize = false;
3599 if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3600 UseExtTspForPerf =
3603 UseExtTspForSize = OptForSize && ApplyExtTspForSize;
3604 }
3605
3606 // Apply tail duplication.
3607 if (allowTailDupPlacement(*F)) {
3608 if (OptForSize)
3609 TailDupSize = 1;
3610 const bool PreRegAlloc = false;
3611 TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3612 /* LayoutMode */ true, TailDupSize);
3613 if (!UseExtTspForSize)
3614 precomputeTriangleChains();
3615 }
3616
3617 // Run the main block placement.
3618 if (!UseExtTspForSize)
3619 buildCFGChains();
3620
3621 // Changing the layout can create new tail merging opportunities.
3622 // TailMerge can create jump into if branches that make CFG irreducible for
3623 // HW that requires structured CFG.
3624 const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3625 AllowTailMerge && BranchFoldPlacement &&
3626 MF.size() > 3;
3627 // No tail merging opportunities if the block number is less than four.
3628 if (EnableTailMerge) {
3629 const unsigned TailMergeSize = TailDupSize + 1;
3630 BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3631 *MBFI, *MBPI, PSI, TailMergeSize);
3632
3633 if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3634 /*AfterPlacement=*/true)) {
3635 // Must redo the post-dominator tree if blocks were changed.
3636 if (MPDT)
3637 MPDT->recalculate(MF);
3638 if (!UseExtTspForSize) {
3639 // Redo the layout if tail merging creates/removes/moves blocks.
3640 BlockToChain.clear();
3641 ComputedEdges.clear();
3642 ChainAllocator.DestroyAll();
3643 buildCFGChains();
3644 }
3645 }
3646 }
3647
3648 // Apply a post-processing optimizing block placement:
3649 // - find a new placement and modify the layout of the blocks in the function;
3650 // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3651 if (UseExtTspForPerf || UseExtTspForSize) {
3652 assert(
3653 !(UseExtTspForPerf && UseExtTspForSize) &&
3654 "UseExtTspForPerf and UseExtTspForSize can not be set simultaneously");
3655 applyExtTsp(/*OptForSize=*/UseExtTspForSize);
3656 createCFGChainExtTsp();
3657 }
3658
3659 optimizeBranches();
3660 alignBlocks();
3661
3662 BlockToChain.clear();
3663 ComputedEdges.clear();
3664 ChainAllocator.DestroyAll();
3665
3666 // View the function.
3668 (ViewBlockFreqFuncName.empty() ||
3669 F->getFunction().getName() == ViewBlockFreqFuncName)) {
3671 MF.RenumberBlocks();
3672 MBFI->view("MBP." + MF.getName(), false);
3673 }
3674
3675 // We always return true as we have no way to track whether the final order
3676 // differs from the original order.
3677 return true;
3678}
3679
3680void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
3681 // Prepare data; blocks are indexed by their index in the current ordering.
3683 BlockIndex.reserve(F->size());
3684 std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3685 CurrentBlockOrder.reserve(F->size());
3686 size_t NumBlocks = 0;
3687 for (const MachineBasicBlock &MBB : *F) {
3688 BlockIndex[&MBB] = NumBlocks++;
3689 CurrentBlockOrder.push_back(&MBB);
3690 }
3691
3692 SmallVector<uint64_t, 0> BlockCounts(F->size());
3693 SmallVector<uint64_t, 0> BlockSizes(F->size());
3695 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
3697 for (MachineBasicBlock &MBB : *F) {
3698 // Getting the block frequency.
3699 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3700 BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
3701 // Getting the block size:
3702 // - approximate the size of an instruction by 4 bytes, and
3703 // - ignore debug instructions.
3704 // Note: getting the exact size of each block is target-dependent and can be
3705 // done by extending the interface of MCCodeEmitter. Experimentally we do
3706 // not see a perf improvement with the exact block sizes.
3707 auto NonDbgInsts =
3709 size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3710 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3711
3712 // Getting jump frequencies.
3713 if (OptForSize) {
3714 Cond.clear();
3715 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
3716 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3717 continue;
3718
3719 const MachineBasicBlock *FTB = MBB.getFallThrough();
3720 // Succs is a collection of distinct destinations of the block reachable
3721 // from MBB via a jump instruction; initialize the list using the three
3722 // (non-necessarily distinct) blocks, FTB, TBB, and FBB.
3723 Succs.clear();
3724 if (TBB && TBB != FTB)
3725 Succs.push_back(TBB);
3726 if (FBB && FBB != FTB)
3727 Succs.push_back(FBB);
3728 if (FTB)
3729 Succs.push_back(FTB);
3730 // Absolute magnitude of non-zero counts does not matter for the
3731 // optimization; prioritize slightly jumps with a single successor, since
3732 // the corresponding jump instruction will be removed from the binary.
3733 const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
3734 for (const MachineBasicBlock *Succ : Succs)
3735 JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3736 } else {
3737 for (MachineBasicBlock *Succ : MBB.successors()) {
3738 auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3739 BlockFrequency JumpFreq = BlockFreq * EP;
3740 JumpCounts.push_back(
3741 {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3742 }
3743 }
3744 }
3745
3746 LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3747 << " with profile = " << F->getFunction().hasProfileData()
3748 << " (" << F->getName() << ")" << "\n");
3749
3750 const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts);
3751 LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));
3752
3753 // Run the layout algorithm.
3754 auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3755 std::vector<const MachineBasicBlock *> NewBlockOrder;
3756 NewBlockOrder.reserve(F->size());
3757 for (uint64_t Node : NewOrder) {
3758 NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3759 }
3760 const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts);
3761 LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));
3762
3763 // If the optimization is unsuccessful, fall back to the original block order.
3764 if (OptForSize && OrgScore > OptScore)
3765 assignBlockOrder(CurrentBlockOrder);
3766 else
3767 assignBlockOrder(NewBlockOrder);
3768}
3769
3770void MachineBlockPlacement::assignBlockOrder(
3771 const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3772 assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3773 F->RenumberBlocks();
3774 // At this point, we possibly removed blocks from the function, so we can't
3775 // renumber the domtree. At this point, we don't need it anymore, though.
3776 // TODO: move this to the point where the dominator tree is actually
3777 // invalidated (i.e., where blocks are removed without updating the domtree).
3778 MPDT = nullptr;
3779
3780 bool HasChanges = false;
3781 for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3782 if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3783 HasChanges = true;
3784 break;
3785 }
3786 }
3787 // Stop early if the new block order is identical to the existing one.
3788 if (!HasChanges)
3789 return;
3790
3791 SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3792 for (auto &MBB : *F) {
3793 PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3794 }
3795
3796 // Sort basic blocks in the function according to the computed order.
3798 for (const MachineBasicBlock *MBB : NewBlockOrder) {
3799 NewIndex[MBB] = NewIndex.size();
3800 }
3801 F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3802 return NewIndex[&L] < NewIndex[&R];
3803 });
3804
3805 // Update basic block branches by inserting explicit fallthrough branches
3806 // when required and re-optimize branches when possible.
3807 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3809 for (auto &MBB : *F) {
3810 MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3812 auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3813 // If this block had a fallthrough before we need an explicit unconditional
3814 // branch to that block if the fallthrough block is not adjacent to the
3815 // block in the new order.
3816 if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3817 TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3818 }
3819
3820 // It might be possible to optimize branches by flipping the condition.
3821 Cond.clear();
3822 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3823 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3824 continue;
3825 MBB.updateTerminator(FTMBB);
3826 }
3827}
3828
3829void MachineBlockPlacement::createCFGChainExtTsp() {
3830 BlockToChain.clear();
3831 ComputedEdges.clear();
3832 ChainAllocator.DestroyAll();
3833
3834 MachineBasicBlock *HeadBB = &F->front();
3835 BlockChain *FunctionChain =
3836 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3837
3838 for (MachineBasicBlock &MBB : *F) {
3839 if (HeadBB == &MBB)
3840 continue; // Ignore head of the chain
3841 FunctionChain->merge(&MBB, nullptr);
3842 }
3843}
3844
3845namespace {
3846
3847/// A pass to compute block placement statistics.
3848///
3849/// A separate pass to compute interesting statistics for evaluating block
3850/// placement. This is separate from the actual placement pass so that they can
3851/// be computed in the absence of any placement transformations or when using
3852/// alternative placement strategies.
3853class MachineBlockPlacementStats {
3854 /// A handle to the branch probability pass.
3855 const MachineBranchProbabilityInfo *MBPI;
3856
3857 /// A handle to the function-wide block frequency pass.
3858 const MachineBlockFrequencyInfo *MBFI;
3859
3860public:
3861 MachineBlockPlacementStats(const MachineBranchProbabilityInfo *MBPI,
3862 const MachineBlockFrequencyInfo *MBFI)
3863 : MBPI(MBPI), MBFI(MBFI) {}
3864 bool run(MachineFunction &MF);
3865};
3866
3867class MachineBlockPlacementStatsLegacy : public MachineFunctionPass {
3868public:
3869 static char ID; // Pass identification, replacement for typeid
3870
3871 MachineBlockPlacementStatsLegacy() : MachineFunctionPass(ID) {
3874 }
3875
3876 bool runOnMachineFunction(MachineFunction &F) override {
3877 auto *MBPI =
3878 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3879 auto *MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
3880 return MachineBlockPlacementStats(MBPI, MBFI).run(F);
3881 }
3882
3883 void getAnalysisUsage(AnalysisUsage &AU) const override {
3886 AU.setPreservesAll();
3888 }
3889};
3890
3891} // end anonymous namespace
3892
3893char MachineBlockPlacementStatsLegacy::ID = 0;
3894
3895char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStatsLegacy::ID;
3896
3897INITIALIZE_PASS_BEGIN(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3898 "Basic Block Placement Stats", false, false)
3901INITIALIZE_PASS_END(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3903
3907 auto &MBPI = MFAM.getResult<MachineBranchProbabilityAnalysis>(MF);
3908 auto &MBFI = MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
3909
3910 MachineBlockPlacementStats(&MBPI, &MBFI).run(MF);
3911 return PreservedAnalyses::all();
3912}
3913
3914bool MachineBlockPlacementStats::run(MachineFunction &F) {
3915 // Check for single-block functions and skip them.
3916 if (std::next(F.begin()) == F.end())
3917 return false;
3918
3919 if (!isFunctionInPrintList(F.getName()))
3920 return false;
3921
3922 for (MachineBasicBlock &MBB : F) {
3923 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3924 Statistic &NumBranches =
3925 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3926 Statistic &BranchTakenFreq =
3927 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3928 for (MachineBasicBlock *Succ : MBB.successors()) {
3929 // Skip if this successor is a fallthrough.
3930 if (MBB.isLayoutSuccessor(Succ))
3931 continue;
3932
3933 BlockFrequency EdgeFreq =
3934 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3935 ++NumBranches;
3936 BranchTakenFreq += EdgeFreq.getFrequency();
3937 }
3938 }
3939
3940 return false;
3941}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
This file defines the BumpPtrAllocator interface.
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Declares methods and data structures for code layout algorithms.
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:638
static unsigned InstrCount
This file defines the DenseMap class.
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
block placement Basic Block Placement Stats
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
Branch Probability Basic Block Placement
static cl::opt< unsigned > ExtTspBlockPlacementMaxBlocks("ext-tsp-block-placement-max-blocks", cl::desc("Maximum number of basic blocks in a function to run ext-TSP " "block placement."), cl::init(UINT_MAX), cl::Hidden)
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > PredecessorLimit("block-placement-predecessor-limit", cl::desc("For blocks with more predecessors, certain layout optimizations" "will be disabled to prevent quadratic compile time."), cl::init(1000), cl::Hidden)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
static cl::opt< bool > ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden, cl::desc("Use ext-tsp for size-aware block placement."))
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunities in " "outline branches."), cl::init(true), cl::Hidden)
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
static cl::opt< bool > RenumberBlocksBeforeView("renumber-blocks-before-view", cl::desc("If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden)
#define DEBUG_TYPE
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:39
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
#define LLVM_DEBUG(...)
Definition: Debug.h:119
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
unify loop Fixup each natural loop to have a single exit block
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:412
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
static BlockFrequency max()
Returns the maximum possible frequency, the saturation value.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
static BranchProbability getOne()
uint32_t getNumerator() const
BranchProbability getCompl() const
static BranchProbability getZero()
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:203
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:177
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:245
bool erase(const KeyT &Val)
Definition: DenseMap.h:319
unsigned size() const
Definition: DenseMap.h:120
bool empty() const
Definition: DenseMap.h:119
iterator begin()
Definition: DenseMap.h:78
iterator end()
Definition: DenseMap.h:87
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:168
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:230
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:124
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Definition: Pass.cpp:188
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:334
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Metadata node.
Definition: Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1445
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1443
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1451
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:899
A single uniqued string.
Definition: Metadata.h:720
LLVM_ABI StringRef getString() const
Definition: Metadata.cpp:617
unsigned pred_size() const
bool isEHPad() const
Returns true if the block is a landing pad.
instr_iterator instr_begin()
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
LLVM_ABI void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
unsigned succ_size() const
void setAlignment(Align A)
Set alignment of the basic block.
LLVM_ABI void dump() const
LLVM_ABI bool isEntryBlock() const
Returns true if this is the entry block of the function.
succ_reverse_iterator succ_rbegin()
LLVM_ABI bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
LLVM_ABI DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
iterator_range< succ_iterator > successors()
LLVM_ABI bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
iterator_range< pred_iterator > predecessors()
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
LLVM_ABI Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName) const
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
unsigned size() const
Function & getFunction()
Return the LLVM function that this machine code represents.
void RenumberBlocks(MachineBasicBlock *MBBFrom=nullptr)
RenumberBlocks - This discards all of the MachineBasicBlock numbers and recomputes them.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Representation of each machine instruction.
Definition: MachineInstr.h:72
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:303
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:716
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:118
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
LLVM_ABI uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
typename vector_type::const_iterator iterator
Definition: SetVector.h:71
size_type size() const
Definition: SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:470
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:356
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:579
typename SuperClass::iterator iterator
Definition: SmallVector.h:578
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:390
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:440
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:411
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
Utility class to perform tail duplication.
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
TargetInstrInfo - Interface to description of machine instruction set.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
bool requiresStructuredCFG() const
Target-Independent Code Generator Pass Configuration Options.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const TargetLowering * getTargetLowering() const
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:134
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:662
#define UINT64_MAX
Definition: DataTypes.h:77
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
LLVM_ABI double calcExtTspScore(ArrayRef< uint64_t > Order, ArrayRef< uint64_t > NodeSizes, ArrayRef< EdgeCount > EdgeCounts)
Estimate the "quality" of a given node order in CFG.
LLVM_ABI std::vector< uint64_t > computeExtTspLayout(ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
LLVM_ABI void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:456
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI void initializeMachineBlockPlacementStatsLegacyPass(PassRegistry &)
void stable_sort(R &&Range)
Definition: STLExtras.h:2077
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
cl::opt< bool > ApplyExtTspWithoutProfile
constexpr from_range_t from_range
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
cl::opt< unsigned > ProfileLikelyProb
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2147
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
cl::opt< unsigned > StaticLikelyProb
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
bool isFunctionInPrintList(StringRef FunctionName)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:82
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:126
LLVM_ABI void initializeMachineBlockPlacementLegacyPass(PassRegistry &)
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1973
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1886
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2139
LLVM_ABI Printable printBlockFreq(const BlockFrequencyInfo &BFI, BlockFrequency Freq)
Print the block frequency Freq relative to the current functions entry frequency.
LLVM_ABI char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
cl::opt< bool > EnableExtTspBlockPlacement
LLVM_ABI char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:856
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static StringRef name()
Gets the name of the pass we are mixed into.
Definition: PassManager.h:72