//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//===----------------------------------------------------------------------===//

#include "SILowerI1Copies.h"
#include "AMDGPU.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);

namespace {

class Vreg1LoweringHelper : public PhiLoweringHelper {
public:
  Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                      MachinePostDominatorTree *PDT);

private:
  DenseSet<Register> ConstrainRegs;

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerCopiesFromI1();
  bool lowerCopiesToI1();
  bool cleanConstrainRegs(bool Changed);
  bool isVreg1(Register Reg) const {
    return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }
};

Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
                                         MachineDominatorTree *DT,
                                         MachinePostDominatorTree *PDT)
    : PhiLoweringHelper(MF, DT, PDT) {}

bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
  assert(Changed || ConstrainRegs.empty());
  for (Register Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return Changed;
}

/// Helper class that analyzes the incoming values of a phi in the control
/// flow graph to determine where an incoming value can simply be taken as a
/// scalar lane mask as-is, and where it needs to be merged with another,
/// previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
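/// For example, in a simple diamond (assuming non-divergent branches in A):
///
///      A
///     / \
///    B   C
///     \ /
///      D   <- block containing the phi
///
/// the set of blocks reachable from the incoming blocks B and C before
/// entering D is just {B, C, D}; neither B nor C has a predecessor in the
/// set, so both incoming values can be taken as-is.
///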
/// TODO: We could use region analysis to quickly skip over SESE regions during
/// the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;
  const SIInstrInfo *TII;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  MapVector<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII)
      : PDT(PDT), TII(TII) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) {
    assert(Stack.empty());
    ReachableMap.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);

    for (auto Incoming : Incomings) {
      MachineBasicBlock *MBB = Incoming.Block;
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (ReachableMap.try_emplace(MBB, false).second)
        append_range(Stack, MBB->successors());
    }

    for (auto &[MBB, Reachable] : ReachableMap) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        Reachable = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
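/// In the example above, a def in B that is used in C has C as the nearest
/// common post-dominator of B and its uses, yet a wave can follow
/// B -> A -> B and return to B without passing through C; the def in B
/// therefore requires the bitwise lowering.
///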
/// TODO: This rule is conservative because it does not check whether the
/// relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
/// the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      MachineRegisterInfo &MRI,
                      MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
                      ArrayRef<Incoming> Incomings = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (auto &Incoming : Incomings)
      Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);

    if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
      SSAUpdater.AddAvailableValue(
          Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Incomings))
          SSAUpdater.AddAvailableValue(
              Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<Incoming> Incomings) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    for (auto &Incoming : Incomings)
      if (Incoming.Block == &MBB)
        return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
                                 MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  return MRI->createVirtualRegister(LaneMaskRegAttrs);
}

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

bool Vreg1LoweringHelper::lowerCopiesFromI1() {
  bool Changed = false;
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      Changed = true;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
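      // Per lane: DstReg = SrcReg[lane] ? -1 : 0. The V_CNDMASK_B32_e64
      // operand order below is src0_modifiers, src0, src1_modifiers, src1,
      // followed by the lane mask that selects between them.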
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF,
                                     MachineDominatorTree *DT,
                                     MachinePostDominatorTree *PDT)
    : MF(MF), DT(DT), PDT(PDT) {
  MRI = &MF->getRegInfo();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }
}

bool PhiLoweringHelper::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT, TII);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<Incoming, 4> Incomings;

  getCandidatesForLowering(Vreg1Phis);
  if (Vreg1Phis.empty())
    return false;

  DT->updateDFSNumbers();
  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    markAsLaneMask(DstReg);
    initializeLaneMaskRegisterAttributes(DstReg);

    collectIncomingValuesFromPhi(MI, Incomings);

    // Sort the incomings such that incoming values that dominate other
    // incoming values are sorted earlier. This allows us to do some amount
    // of on-the-fly constant folding.
    // Incomings with a smaller DFSNumIn go first; DFSNumIn is 0 for the
    // entry block.
    llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
      return DT->getNode(LHS.Block)->getDFSNumIn() <
             DT->getNode(RHS.Block)->getDFSNumIn();
    });

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);

    // FIXME: This fails to find irreducible cycles. If we have a def (other
    // than a constant) in a pair of blocks that end up looping back to each
    // other, it will be mishandled. Due to structurization this shouldn't
    // occur in practice.
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
                        Incomings);

      for (auto &Incoming : Incomings) {
        Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg);
      }

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
      PIA.analyze(MBB, Incomings);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(
            MBB, insertUndefLaneMask(MBB, MRI, LaneMaskRegAttrs));

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        if (PIA.isSource(IMBB)) {
          constrainAsLaneMask(Incoming);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
        } else {
          Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg);
        }
      }

      for (auto &Incoming : Incomings) {
        if (!Incoming.UpdatedReg.isValid())
          continue;

        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(IMBB, getSaluInsertionAtEnd(IMBB), {},
                            Incoming.UpdatedReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&IMBB),
                            Incoming.Reg);
      }
    }

    Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      replaceDstReg(NewReg, DstReg, &MBB);
      MI->eraseFromParent();
    }

    Incomings.clear();
  }
  return true;
}

bool Vreg1LoweringHelper::lowerCopiesToI1() {
  bool Changed = false;
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      Changed = true;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      markAsLaneMask(DstReg);
      initializeLaneMaskRegisterAttributes(DstReg);

      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
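        // Per lane: TmpReg[lane] = (SrcReg[lane] != 0), turning the 32-bit
        // value into a lane mask.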
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      } else {
        // SrcReg needs to be live beyond the copy.
        MI.getOperand(1).setIsKill(false);
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
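      // (A lane that left the loop in an earlier iteration must keep its
      // recorded value, so later iterations may only update the bits of
      // currently active lanes; see buildMergeLaneMasks.)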
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

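// Decide whether Reg is a constant lane mask, looking through lane-mask
// COPYs. An IMPLICIT_DEF counts as constant (Val is left as initialized by
// the caller); an S_MOV of 0 or -1 sets Val to false or true, respectively.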
bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
      return true;

    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Reg.isVirtual())
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY.
  llvm_unreachable("SCC used by terminator but no def in block");
}

// VReg_1 -> SReg_32 or SReg_64
void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void Vreg1LoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }
}

void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    assert(i + 1 < MI->getNumOperands());
    Register IncomingReg = MI->getOperand(i).getReg();
    MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
    MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

    if (IncomingDef->getOpcode() == AMDGPU::COPY) {
      IncomingReg = IncomingDef->getOperand(1).getReg();
      assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
      assert(!IncomingDef->getOperand(1).getSubReg());
    } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
      continue;
    } else {
      assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
    }

    Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
  }
}

void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                        MachineBasicBlock *MBB) {
  MRI->replaceRegWith(NewReg, OldReg);
}

void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator I,
                                              const DebugLoc &DL,
                                              Register DstReg, Register PrevReg,
                                              Register CurReg) {
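  // Conceptually, DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC): lanes that
  // are currently active take the current value, inactive lanes keep the
  // previous one. The cases below fold operands known to be all-zeros or
  // all-ones constants.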
  bool PrevVal = false;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal = false;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  Register PrevMaskedReg;
  Register CurMaskedReg;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}

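// Nothing to constrain here: in the SelectionDAG path, incoming values are
// already plain lane-mask registers.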
void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}

849
850/// Lower all instructions that def or use vreg_1 registers.
851///
852/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
853/// occur around inline assembly. We do this first, before vreg_1 registers
854/// are changed to scalar mask registers.
855///
856/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
857/// all others, because phi lowering looks through copies and can therefore
858/// often make copy lowering unnecessary.
static bool runFixI1Copies(MachineFunction &MF, MachineDominatorTree &MDT,
                           MachinePostDominatorTree &MPDT) {
  // Only need to run this in the SelectionDAG path.
  if (MF.getProperties().hasSelected())
    return false;

  Vreg1LoweringHelper Helper(&MF, &MDT, &MPDT);
  bool Changed = false;
  Changed |= Helper.lowerCopiesFromI1();
  Changed |= Helper.lowerPhis();
  Changed |= Helper.lowerCopiesToI1();
  return Helper.cleanConstrainRegs(Changed);
}

PreservedAnalyses
SILowerI1CopiesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  MachinePostDominatorTree &MPDT =
      MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
  bool Changed = runFixI1Copies(MF, MDT, MPDT);
  if (!Changed)
    return PreservedAnalyses::all();

  // TODO: Probably preserves most.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

class SILowerI1CopiesLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILowerI1CopiesLegacy() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

bool SILowerI1CopiesLegacy::runOnMachineFunction(MachineFunction &MF) {
  MachineDominatorTree &MDT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &MPDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  return runFixI1Copies(MF, MDT, MPDT);
}

INITIALIZE_PASS_BEGIN(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                    false, false)

char SILowerI1CopiesLegacy::ID = 0;

char &llvm::SILowerI1CopiesLegacyID = SILowerI1CopiesLegacy::ID;