X86CmovConversion.cpp
//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements a pass that converts X86 cmov instructions into
/// branches when profitable. This pass is conservative. It transforms if and
/// only if it can guarantee a gain with high confidence.
///
/// Thus, the optimization applies under the following conditions:
///   1. Consider as candidates only CMOVs in innermost loops (assume that
///      most hotspots are represented by these loops).
///   2. Given a group of CMOV instructions that are using the same EFLAGS def
///      instruction:
///      a. Consider them as candidates only if all have the same condition
///         code or the opposite one, to prevent generating more than one
///         conditional jump per EFLAGS def instruction.
///      b. Consider them as candidates only if all are profitable to be
///         converted (assume that one bad conversion may cause a degradation).
///   3. Apply conversion only for loops that are found profitable and only for
///      CMOV candidates that were found profitable.
///      a. A loop is considered profitable only if conversion will reduce its
///         depth cost by some threshold.
///      b. A CMOV is considered profitable if the cost of its condition is
///         higher than the average cost of its true-value and false-value by
///         25% of the branch-misprediction penalty. This ensures no
///         degradation even with 25% branch misprediction.
///
/// Note: This pass is assumed to run on SSA machine code.
//
//===----------------------------------------------------------------------===//
//
// External interfaces:
//   FunctionPass *llvm::createX86CmovConverterPass();
//   bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/CGPassBuilderOption.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "x86-cmov-conversion"

STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");

// This internal switch can be used to turn off the cmov/branch optimization.
static cl::opt<bool>
    EnableCmovConverter("x86-cmov-converter",
                        cl::desc("Enable the X86 cmov-to-branch optimization."),
                        cl::init(true), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("x86-cmov-converter-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<bool> ForceMemOperand(
    "x86-cmov-converter-force-mem-operand",
    cl::desc("Convert cmovs to branches whenever they have memory operands."),
    cl::init(true), cl::Hidden);

static cl::opt<bool> ForceAll(
    "x86-cmov-converter-force-all",
    cl::desc("Convert all cmovs to branches."),
    cl::init(false), cl::Hidden);

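// Note: as with any cl::opt, these switches are reachable from tools that
// parse LLVM command-line options (llc being the usual one); e.g. an
// invocation such as `llc -x86-cmov-converter=false ...` should disable the
// pass entirely (illustrative usage, not taken from this file).
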
namespace {

/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
  X86CmovConverterPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "X86 cmov Conversion"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  MachineRegisterInfo *MRI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MachineLoopInfo *MLI = nullptr;
  TargetSchedModel TSchedModel;

  /// List of consecutive CMOV instructions.
  using CmovGroup = SmallVector<MachineInstr *, 2>;
  using CmovGroups = SmallVector<CmovGroup, 2>;

  /// Collect all CMOV-group-candidates in \p CurrLoop and update \p
  /// CmovInstGroups accordingly.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
  /// \returns true iff it found any CMOV-group-candidate.
  bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                             CmovGroups &CmovInstGroups,
                             bool IncludeLoads = false);

  /// Check if it is profitable to transform each CMOV-group-candidate into a
  /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
  /// \returns true iff any CMOV-group-candidate remains.
  bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                                        CmovGroups &CmovInstGroups);

  /// Convert the given list of consecutive CMOV instructions into a branch.
  ///
  /// \param Group Consecutive CMOV instructions to be converted into a branch.
  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
};

} // end anonymous namespace

char X86CmovConverterPass::ID = 0;

void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
  AU.addRequired<MachineLoopInfoWrapperPass>();
}

bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  if (!EnableCmovConverter)
    return false;

  // If the SelectOptimize pass is enabled, cmovs have already been optimized.
  if (!getCGPassBuilderOption().DisableSelectOptimize)
    return false;

  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << "**********\n");

  bool Changed = false;
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  MRI = &MF.getRegInfo();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  TSchedModel.init(&STI);

  // Before we handle the more subtle cases of register-register CMOVs inside
  // of potentially hot loops, we want to quickly remove all CMOVs (ForceAll) or
  // the ones with a memory operand (ForceMemOperand option). The latter risk a
  // stall while waiting for the load to complete, which speculative execution
  // behind a branch is better suited to handle on modern x86 chips.
  if (ForceMemOperand || ForceAll) {
    CmovGroups AllCmovGroups;
    SmallVector<MachineBasicBlock *, 4> Blocks(make_pointer_range(MF));
    if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
      for (auto &Group : AllCmovGroups) {
        // Skip any group that doesn't do at least one memory operand cmov.
        if (ForceMemOperand && !ForceAll &&
            llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
          continue;

        // For CMOV groups which we can rewrite and which contain a memory
        // load, always rewrite them. On x86, a CMOV will dramatically amplify
        // any memory latency by blocking speculative execution.
        Changed = true;
        convertCmovInstsToBranches(Group);
      }
    }
    // Early return as ForceAll converts all CmovGroups.
    if (ForceAll)
      return Changed;
  }

  //===--------------------------------------------------------------------===//
  // Register-operand Conversion Algorithm
  // ---------
  //   For each innermost loop
  //     collectCmovCandidates() {
  //       Find all CMOV-group-candidates.
  //     }
  //
  //     checkForProfitableCmovCandidates() {
  //       * Calculate both loop-depth and optimized-loop-depth.
  //       * Use these depths to check for loop transformation profitability.
  //       * Check for CMOV-group-candidate transformation profitability.
  //     }
  //
  //     For each profitable CMOV-group-candidate
  //       convertCmovInstsToBranches() {
  //         * Create FalseBB, SinkBB, Conditional branch to SinkBB.
  //         * Replace each CMOV instruction with a PHI instruction in SinkBB.
  //       }
  //
  // Note: For more details, see each function description.
  //===--------------------------------------------------------------------===//

  // Build up the loops in pre-order.
  SmallVector<MachineLoop *, 4> Loops(MLI->begin(), MLI->end());
  // Note that we need to check size on each iteration as we accumulate child
  // loops.
  for (int i = 0; i < (int)Loops.size(); ++i)
    llvm::append_range(Loops, Loops[i]->getSubLoops());

  for (MachineLoop *CurrLoop : Loops) {
    // Optimize only innermost loops.
    if (!CurrLoop->getSubLoops().empty())
      continue;

    // List of consecutive CMOV instructions to be processed.
    CmovGroups CmovInstGroups;

    if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
      continue;

    if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
                                          CmovInstGroups))
      continue;

    Changed = true;
    for (auto &Group : CmovInstGroups)
      convertCmovInstsToBranches(Group);
  }

  return Changed;
}

bool X86CmovConverterPass::collectCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
    bool IncludeLoads) {
  //===--------------------------------------------------------------------===//
  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
  //
  // CMOV-group:
  //   CMOV instructions in the same MBB that use the same EFLAGS def
  //   instruction.
  //
  // CMOV-group-candidate:
  //   CMOV-group where all the CMOV instructions are
  //     1. consecutive.
  //     2. have the same condition code or the opposite one.
  //     3. have only register operands (X86::CMOVrr).
  //===--------------------------------------------------------------------===//
  // List of possible improvements (TODOs):
  // ---------------------------------------
  // TODO: Add support for X86::CMOVrm instructions.
  // TODO: Add support for X86::SETcc instructions.
  // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
  //===--------------------------------------------------------------------===//
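  // For illustration (made-up virtual registers, schematic rather than exact
  // MIR syntax), a group such as:
  //   $eflags = CMP32rr %a, %b          ; the single EFLAGS def
  //   %v1 = CMOV32rr %t1, %f1, COND_E   ; uses that EFLAGS def
  //   %v2 = CMOV32rr %t2, %f2, COND_NE  ; opposite condition code is still OK
  // forms one CMOV-group-candidate, while an intervening non-CMOV instruction
  // or a third, unrelated condition code would mark the group as skipped.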

  // Currently processed CMOV-group.
  CmovGroup Group;
  for (auto *MBB : Blocks) {
    Group.clear();
    // Condition code of the first CMOV instruction in the currently processed
    // range and its opposite condition code.
    X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
                  MemOpCC = X86::COND_INVALID;
    // Indicator of a non-CMOVrr instruction in the currently processed range.
    bool FoundNonCMOVInst = false;
    // Indicator for whether the currently processed CMOV-group should be
    // skipped.
    bool SkipGroup = false;

    for (auto &I : *MBB) {
      // Skip debug instructions.
      if (I.isDebugInstr())
        continue;

      X86::CondCode CC = X86::getCondFromCMov(I);
      // Check if we found a X86::CMOVrr instruction. If it is marked as
      // unpredictable, skip it and do not convert it to a branch.
      if (CC != X86::COND_INVALID &&
          !I.getFlag(MachineInstr::MIFlag::Unpredictable) &&
          (IncludeLoads || !I.mayLoad())) {
        if (Group.empty()) {
          // We found the first CMOV in the range, reset flags.
          FirstCC = CC;
          FirstOppCC = X86::GetOppositeBranchCondition(CC);
          // Clear out the prior group's memory operand CC.
          MemOpCC = X86::COND_INVALID;
          FoundNonCMOVInst = false;
          SkipGroup = false;
        }
        Group.push_back(&I);
        // Check if it is a non-consecutive CMOV instruction or it has a
        // different condition code than FirstCC or FirstOppCC.
        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
          // Mark the SkipGroup indicator to skip the currently processed
          // CMOV-group.
          SkipGroup = true;
        if (I.mayLoad()) {
          if (MemOpCC == X86::COND_INVALID)
            // The first memory operand CMOV.
            MemOpCC = CC;
          else if (CC != MemOpCC)
            // Can't handle mixed conditions with memory operands.
            SkipGroup = true;
        }
        // Check if we were relying on zero-extending behavior of the CMOV.
        if (!SkipGroup &&
            llvm::any_of(
                MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
                [&](MachineInstr &UseI) {
                  return UseI.getOpcode() == X86::SUBREG_TO_REG;
                }))
          // FIXME: We should model the cost of using an explicit MOV to handle
          // the zero-extension rather than just refusing to handle this.
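          // (If the conversion went ahead, the value would instead be defined
          // by a PHI, which SUBREG_TO_REG cannot rely on for the implicit
          // zero-extension a 32-bit CMOV def provides, hence the bail-out.)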
          SkipGroup = true;
        continue;
      }
      // If Group is empty, keep looking for the first CMOV in the range.
      if (Group.empty())
        continue;

      // We found a non X86::CMOVrr instruction.
      FoundNonCMOVInst = true;
      // Check if this instruction defines EFLAGS, to determine the end of the
      // processed range, as there would be no more instructions using the
      // current EFLAGS def.
      if (I.definesRegister(X86::EFLAGS, /*TRI=*/nullptr)) {
        // Check if the currently processed CMOV-group should not be skipped
        // and add it as a CMOV-group-candidate.
        if (!SkipGroup)
          CmovInstGroups.push_back(Group);
        else
          ++NumOfSkippedCmovGroups;
        Group.clear();
      }
    }
    // End of basic block is considered end of range, check if the currently
    // processed CMOV-group should not be skipped and add it as a
    // CMOV-group-candidate.
    if (Group.empty())
      continue;
    if (!SkipGroup)
      CmovInstGroups.push_back(Group);
    else
      ++NumOfSkippedCmovGroups;
  }

  NumOfCmovGroupCandidate += CmovInstGroups.size();
  return !CmovInstGroups.empty();
}

/// \returns Depth of CMOV instruction as if it was converted into branch.
/// \param TrueOpDepth depth cost of CMOV true value operand.
/// \param FalseOpDepth depth cost of CMOV false value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  // The depth of the result after branch conversion is
  // TrueOpDepth * TrueOpProbability + FalseOpDepth * FalseOpProbability.
  // As we have no info about branch weight, we assume 75% for one and 25% for
  // the other, and pick the result with the largest resulting depth.
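  // For example (arbitrary illustrative numbers): TrueOpDepth = 8 and
  // FalseOpDepth = 2 gives max(ceil((3*8 + 2) / 4), ceil((3*2 + 8) / 4)) =
  // max(7, 4) = 7 cycles.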
  return std::max(
      divideCeil(TrueOpDepth * 3 + FalseOpDepth, 4),
      divideCeil(FalseOpDepth * 3 + TrueOpDepth, 4));
}

bool X86CmovConverterPass::checkForProfitableCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
  struct DepthInfo {
    /// Depth of original loop.
    unsigned Depth;
    /// Depth of optimized loop.
    unsigned OptDepth;
  };
  /// Number of loop iterations for which instruction depth is calculated.
  static const unsigned LoopIterations = 2;
  DenseMap<MachineInstr *, DepthInfo> DepthMap;
  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
  /// For each register type maps the register to its last def instruction.
  DenseMap<Register, MachineInstr *> RegDefMaps[RegTypeNum];
  /// Maps register operand to its def instruction, which can be nullptr if it
  /// is unknown (e.g., operand is defined outside the loop).
  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;

  // Set depth of unknown instruction (i.e., nullptr) to zero.
  DepthMap[nullptr] = {0, 0};

  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
  for (auto &Group : CmovInstGroups)
    CmovInstructions.insert_range(Group);

  //===--------------------------------------------------------------------===//
  // Step 1: Calculate instruction depth and loop depth.
  // Optimized-Loop:
  //   loop with CMOV-group-candidates converted into branches.
  //
  // Instruction-Depth:
  //   instruction latency + max operand depth.
  //   * For a CMOV instruction in the optimized loop the depth is calculated as:
  //     CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-depth)
  // TODO: Find a better way to estimate the latency of the branch instruction
  //       rather than using the CMOV latency.
  //
  // Loop-Depth:
  //   max instruction depth of all instructions in the loop.
  // Note: instruction with max depth represents the critical-path in the loop.
  //
  // Loop-Depth[i]:
  //   Loop-Depth calculated for first `i` iterations.
  // Note: it is enough to calculate depth for up to two iterations.
  //
  // Depth-Diff[i]:
  //   Number of cycles saved in first `i` iterations by optimizing the loop.
  //===--------------------------------------------------------------------===//
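  // Worked example (illustrative numbers): an ADD with latency 1 whose deepest
  // operand has depth 5 gets depth 1 + 5 = 6; a CMOV with latency 1 whose
  // true/false operands have optimized depths 4 and 2 gets
  // OptDepth 1 + getDepthOfOptCmov(4, 2) = 1 + 4 = 5. Loop-Depth is then the
  // maximum such depth over all instructions in the loop, tracked separately
  // for the original and the optimized form.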
  for (DepthInfo &MaxDepth : LoopDepth) {
    for (auto *MBB : Blocks) {
      // Clear physical registers Def map.
      RegDefMaps[PhyRegType].clear();
      for (MachineInstr &MI : *MBB) {
        // Skip debug instructions.
        if (MI.isDebugInstr())
          continue;
        unsigned MIDepth = 0;
        unsigned MIDepthOpt = 0;
        bool IsCMOV = CmovInstructions.count(&MI);
        for (auto &MO : MI.uses()) {
          // Checks for "isUse()" as "uses()" also returns implicit definitions.
          if (!MO.isReg() || !MO.isUse())
            continue;
          Register Reg = MO.getReg();
          auto &RDM = RegDefMaps[Reg.isVirtual()];
          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
            OperandToDefMap[&MO] = DefMI;
            DepthInfo Info = DepthMap.lookup(DefMI);
            MIDepth = std::max(MIDepth, Info.Depth);
            if (!IsCMOV)
              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
          }
        }

        if (IsCMOV)
          MIDepthOpt = getDepthOfOptCmov(
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);

        // Iterate over all operands to handle implicit definitions as well.
        for (auto &MO : MI.operands()) {
          if (!MO.isReg() || !MO.isDef())
            continue;
          Register Reg = MO.getReg();
          RegDefMaps[Reg.isVirtual()][Reg] = &MI;
        }

        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
        DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
      }
    }
  }

  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};

  //===--------------------------------------------------------------------===//
  // Step 2: Check if the loop is worth optimizing.
  // Worth-Optimize-Loop:
  //   case 1: Diff[1] == Diff[0]
  //           Critical-path is iteration independent - there is no dependency
  //           of critical-path instructions on critical-path instructions of
  //           previous iteration.
  //           Thus, it is enough to check the gain percent of the 1st
  //           iteration - to be conservative, the optimized loop needs to have
  //           a depth of 12.5% fewer cycles than the original loop, per
  //           iteration.
  //
  //   case 2: Diff[1] > Diff[0]
  //           Critical-path is iteration dependent - there is a dependency of
  //           critical-path instructions on critical-path instructions of the
  //           previous iteration.
  //           Thus, check the gain percent of the 2nd iteration (similar to
  //           the previous case), but it is also required to check the
  //           gradient of the gain - the change in Depth-Diff compared to the
  //           change in Loop-Depth between the 1st and 2nd iterations.
  //           To be conservative, the gradient needs to be at least 50%.
  //
  //   In addition, in order not to optimize loops with a very small gain, the
  //   gain (in cycles) after the 2nd iteration should not be less than a given
  //   threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
  //
  // If the loop is not worth optimizing, remove all CMOV-group-candidates.
  //===--------------------------------------------------------------------===//
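  // Worked example (illustrative numbers, GainCycleThreshold = 4):
  // LoopDepth[0] = {Depth: 24, OptDepth: 20} and
  // LoopDepth[1] = {Depth: 48, OptDepth: 40} give Diff = {4, 8}; Diff[1] >= 4
  // passes, but the gradient check (8 - 4) * 2 >= (48 - 24) fails (8 < 24), so
  // the loop is rejected. With Diff = {4, 4} and Depth 24 per iteration,
  // case 1 applies and 4 * 8 >= 24 accepts the loop.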
  if (Diff[1] < GainCycleThreshold)
    return false;

  bool WorthOptLoop = false;
  if (Diff[1] == Diff[0])
    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
  else if (Diff[1] > Diff[0])
    WorthOptLoop =
        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
        (Diff[1] * 8 >= LoopDepth[1].Depth);

  if (!WorthOptLoop)
    return false;

  ++NumOfLoopCandidate;

  //===--------------------------------------------------------------------===//
  // Step 3: Check for each CMOV-group-candidate if it is worth optimizing.
  // Worth-Optimize-Group:
  //   Iff it is worth optimizing all CMOV instructions in the group.
  //
  // Worth-Optimize-CMOV:
  //   A predicted branch is faster than CMOV by the difference between the
  //   depth of the condition operand and the depth of the taken (predicted)
  //   value operand.
  //   To be conservative, the gain of such a CMOV transformation should cover
  //   at least 25% of the branch-misprediction penalty.
  //===--------------------------------------------------------------------===//
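  // Worked example (illustrative numbers): with a misprediction penalty of 20
  // cycles, a CMOV whose condition has depth 10 and whose predicted value has
  // depth 4 gains 6 cycles, and 6 * 4 >= 20, so it stays a candidate; a gain
  // of only 4 cycles (4 * 4 < 20) would drop the whole group.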
  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
  CmovGroups TempGroups;
  std::swap(TempGroups, CmovInstGroups);
  for (auto &Group : TempGroups) {
    bool WorthOpGroup = true;
    for (auto *MI : Group) {
      // Avoid CMOV instructions whose value is used as a pointer to load from.
      // This is another conservative check to avoid converting a CMOV
      // instruction used with a tree-search-like algorithm, where the branch
      // is unpredictable.
      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
      if (hasSingleElement(UIs)) {
        unsigned Op = UIs.begin()->getOpcode();
        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
          WorthOpGroup = false;
          break;
        }
      }

      unsigned CondCost =
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
      unsigned ValCost = getDepthOfOptCmov(
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
      if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
        WorthOpGroup = false;
        break;
      }
    }

    if (WorthOpGroup)
      CmovInstGroups.push_back(Group);
  }

  return !CmovInstGroups.empty();
}

static bool checkEFLAGSLive(MachineInstr *MI) {
  if (MI->killsRegister(X86::EFLAGS, /*TRI=*/nullptr))
    return false;

  // The EFLAGS operand of MI might be missing a kill marker.
  // Figure out whether EFLAGS should be live after the MI instruction.
  MachineBasicBlock *BB = MI->getParent();
  MachineBasicBlock::iterator ItrMI = MI;

  // Scan forward through BB for a use/def of EFLAGS.
  for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
    if (I->readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return true;
    if (I->definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return false;
  }

  // We hit the end of the block, check whether EFLAGS is live into a successor.
  for (MachineBasicBlock *Succ : BB->successors())
    if (Succ->isLiveIn(X86::EFLAGS))
      return true;

  return false;
}

/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
/// group of CMOV instructions, which may contain debug instructions in between,
/// move all debug instructions to after the last CMOV instruction, making the
/// CMOV group consecutive.
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
  assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
         "Last instruction in a CMOV group must be a CMOV instruction");

  SmallVector<MachineInstr *, 2> DBGInstructions;
  for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
    if (I->isDebugInstr())
      DBGInstructions.push_back(&*I);
  }

  // Splice the debug instructions after the cmov group.
  MachineBasicBlock *MBB = First->getParent();
  for (auto *MI : DBGInstructions)
    MBB->insertAfter(Last, MI->removeFromParent());
}

void X86CmovConverterPass::convertCmovInstsToBranches(
    SmallVectorImpl<MachineInstr *> &Group) const {
  assert(!Group.empty() && "No CMOV instructions to convert");
  ++NumOfOptimizedCmovGroups;

  // If the CMOV group is not packed, e.g., there are debug instructions between
  // the first CMOV and the last CMOV, then pack the group and make the CMOV
  // instructions consecutive by moving the debug instructions to after the
  // last CMOV.
  packCmovGroup(Group.front(), Group.back());

  // To convert a CMOVcc instruction, we actually have to insert the diamond
  // control-flow pattern. The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.

  // Before
  // -----
  // MBB:
  //   cond = cmp ...
  //   v1 = CMOVge t1, f1, cond
  //   v2 = CMOVlt t2, f2, cond
  //   v3 = CMOVge v1, f3, cond
  //
  // After
  // -----
  // MBB:
  //   cond = cmp ...
  //   jge %SinkMBB
  //
  // FalseMBB:
  //   jmp %SinkMBB
  //
  // SinkMBB:
  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
  //                                          ; true-value with false-value
  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
  //                                          ; previous Phi instruction result

  MachineInstr &MI = *Group.front();
  MachineInstr *LastCMOV = Group.back();
  DebugLoc DL = MI.getDebugLoc();

  X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  // Potentially swap the condition codes so that any memory operand to a CMOV
  // is in the *false* position instead of the *true* position. We can invert
  // any non-memory operand CMOV instructions to cope with this and we ensure
  // memory operand CMOVs are only included with a single condition code.
  if (llvm::any_of(Group, [&](MachineInstr *I) {
        return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
      }))
    std::swap(CC, OppCC);

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction::iterator It = ++MBB->getIterator();
  MachineFunction *F = MBB->getParent();
  const BasicBlock *BB = MBB->getBasicBlock();

  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  if (checkEFLAGSLive(LastCMOV)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Add the false and sink blocks as its successors.
  MBB->addSuccessor(FalseMBB);
  MBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction.
  BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);

  // Add the sink block to the false block successors.
  FalseMBB->addSuccessor(SinkMBB);

  MachineInstrBuilder MIB;
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // First we need to insert an explicit load on the false path for any memory
  // operand. We also need to potentially do register rewriting here, but it is
  // simpler as the memory operands are always on the false path so we can
  // simply take that input, whatever it is.
  DenseMap<Register, Register> FalseBBRegRewriteTable;
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
    auto &MI = *MIIt++;
    // Skip any CMOVs in this group which don't load from memory.
    if (!MI.mayLoad()) {
      // Remember the false-side register input.
      Register FalseReg =
          MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
      // Walk back through any intermediate cmovs referenced.
      while (true) {
        auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
        if (FRIt == FalseBBRegRewriteTable.end())
          break;
        FalseReg = FRIt->second;
      }
      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
      continue;
    }

    // The condition must be the *opposite* of the one we've decided to branch
    // on as the branch will go *around* the load and the load should happen
    // when the CMOV condition is false.
    assert(X86::getCondFromCMov(MI) == OppCC &&
           "Can only handle memory-operand cmov instructions with a condition "
           "opposite to the selected branch direction.");

    // The goal is to rewrite the cmov from:
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), (mem)
    //
    // to
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), %C
    //   FalseMBB:
    //     %C = MOV (mem)
    //
    // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
    //
    //   MBB:
    //     JMP!cc SinkMBB
    //   FalseMBB:
    //     %C = MOV (mem)
    //   SinkMBB:
    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]

    // Get a fresh register to use as the destination of the MOV.
    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
    Register TmpReg = MRI->createVirtualRegister(RC);

    // Retain debug instr number when unfolded.
    unsigned OldDebugInstrNum = MI.peekDebugInstrNum();
    SmallVector<MachineInstr *, 4> NewMIs;
    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
                                             /*UnfoldLoad*/ true,
                                             /*UnfoldStore*/ false, NewMIs);
    (void)Unfolded;
    assert(Unfolded && "Should never fail to unfold a loading cmov!");

    // Move the new CMOV to just before the old one and reset any impacted
    // iterator.
    auto *NewCMOV = NewMIs.pop_back_val();
    assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
           "Last new instruction isn't the expected CMOV!");
    LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
    if (&*MIItBegin == &MI)
      MIItBegin = MachineBasicBlock::iterator(NewCMOV);

    if (OldDebugInstrNum)
      NewCMOV->setDebugInstrNum(OldDebugInstrNum);

    // Sink whatever instructions were needed to produce the unfolded operand
    // into the false block.
    for (auto *NewMI : NewMIs) {
      LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
      FalseMBB->insert(FalseInsertionPoint, NewMI);
      // Re-map any operands that are from other cmovs to the inputs for this
      // block.
      for (auto &MOp : NewMI->uses()) {
        if (!MOp.isReg())
          continue;
        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
        if (It == FalseBBRegRewriteTable.end())
          continue;

        MOp.setReg(It->second);
        // This might have been a kill when it referenced the cmov result, but
        // it won't necessarily be once rewritten.
        // FIXME: We could potentially improve this by tracking whether the
        // operand to the cmov was also a kill, and then skipping the PHI node
        // construction below.
        MOp.setIsKill(false);
      }
    }
    MBB->erase(&MI);

    // Add this PHI to the rewrite table.
    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
  }

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from earlier PHIs'
  // destination registers to the registers that went into the PHI.
  DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    Register DestReg = MIIt->getOperand(0).getReg();
    Register Op1Reg = MIIt->getOperand(1).getReg();
    Register Op2Reg = MIIt->getOperand(2).getReg();

    // If this CMOV we are processing is the opposite condition from the jump
    // we generated, then we have to swap the operands for the PHI that is
    // going to be generated.
    if (X86::getCondFromCMov(*MIIt) == OppCC)
      std::swap(Op1Reg, Op2Reg);

    auto Op1Itr = RegRewriteTable.find(Op1Reg);
    if (Op1Itr != RegRewriteTable.end())
      Op1Reg = Op1Itr->second.first;

    auto Op2Itr = RegRewriteTable.find(Op2Reg);
    if (Op2Itr != RegRewriteTable.end())
      Op2Reg = Op2Itr->second.second;

    // SinkMBB:
    //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
    // ...
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(MBB);
    (void)MIB;
    LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
    LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());

    // debug-info: we can just copy the instr-ref number from one instruction
    // to the other, seeing how it's a one-for-one substitution.
    if (unsigned InstrNum = MIIt->peekDebugInstrNum())
      MIB->setDebugInstrNum(InstrNum);

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // Reset the NoPHIs property if a PHI was inserted to prevent a conflict with
  // the MachineVerifier during testing.
  if (MIItBegin != MIItEnd)
    F->getProperties().resetNoPHIs();

  // Now remove the CMOV(s).
  MBB->erase(MIItBegin, MIItEnd);

  // Add new basic blocks to MachineLoopInfo.
  if (MachineLoop *L = MLI->getLoopFor(MBB)) {
    L->addBasicBlockToLoop(FalseMBB, *MLI);
    L->addBasicBlockToLoop(SinkMBB, *MLI);
  }
}

INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                    false, false)

FunctionPass *llvm::createX86CmovConverterPass() {
  return new X86CmovConverterPass();
}