//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
/// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
/// Handles all cases of temporal divergence.
/// For divergent non-phi i1 and uniform i1 uses outside of the cycle, this
/// pass currently depends on LCSSA to insert phis with a single incoming
/// value.
//
//===----------------------------------------------------------------------===//
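
// Sketch of temporal divergence (illustrative MIR-like pseudocode, not taken
// from an actual test):
//
//   bb.loop:
//     %v:_(s32) = ...           ; defined inside the loop
//     ...                       ; divergent loop exit
//   bb.exit:
//     %u:_(s32) = G_ADD %v, ... ; %v used outside the loop
//
// With a divergent exit, lanes leave the loop in different iterations, so
// each lane must observe the value %v had in the iteration in which that lane
// exited; see lowerTemporalDivergence and lowerTemporalDivergenceI1 below.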

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "SILowerI1Copies.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"

using namespace llvm;

namespace {

class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU GlobalISel divergence lowering";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addRequired<MachineUniformityAnalysisPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class DivergenceLoweringHelper : public PhiLoweringHelper {
public:
  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                           MachinePostDominatorTree *PDT,
                           MachineUniformityInfo *MUI);

private:
  MachineUniformityInfo *MUI = nullptr;
  MachineIRBuilder B;

  Register buildRegCopyToLaneMask(Register Reg);

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerTemporalDivergence();
  bool lowerTemporalDivergenceI1();
};

DivergenceLoweringHelper::DivergenceLoweringHelper(
    MachineFunction *MF, MachineDominatorTree *DT,
    MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
    : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}

// _(s1) -> SReg_32/64(s1)
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
  assert(MRI->getType(DstReg) == LLT::scalar(1));

  if (MRI->getRegClassOrNull(DstReg)) {
    if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
      return;
    llvm_unreachable("Failed to constrain register class");
  }

  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void DivergenceLoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  LLT S1 = LLT::scalar(1);

  // Add divergent i1 phis to the list.
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      Register Dst = MI.getOperand(0).getReg();
      if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
        Vreg1Phis.push_back(&MI);
    }
  }
}

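// A phi reaching this pass has the MIR form
//   %dst:_(s1) = G_PHI %val0(s1), %bb.0, %val1(s1), %bb.1, ...
// i.e. operand 0 is the def and incoming (value, block) operand pairs follow,
// hence the stride-2 walk below.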
void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    Incomings.emplace_back(MI->getOperand(i).getReg(),
                           MI->getOperand(i + 1).getMBB(), Register());
  }
}

void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                             MachineBasicBlock *MBB) {
  BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
      .addReg(NewReg);
}

// Copy Reg to a new lane mask register, inserting the copy after the
// instruction that defines Reg, skipping phis if needed.
Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
  Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  MachineInstr *Instr = MRI->getVRegDef(Reg);
  MachineBasicBlock *MBB = Instr->getParent();
  B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
  B.buildCopy(LaneMask, Reg);
  return LaneMask;
}

// bb.previous
// %PrevReg = ...
//
// bb.current
// %CurReg = ...
//
// %DstReg - not defined
//
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
//
// bb.previous
// %PrevReg = ...
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
//
// bb.current
// %CurReg = ...
// %CurRegCopy:sreg_32(s1) = COPY %CurReg
// ...
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, %ExecReg - active lanes
//                              set to 0
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, %CurRegCopy - inactive lanes set
//                             to 0
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, %CurMaskedReg
//
// DstReg: for active lanes, the bit from PrevReg is rewritten with the bit
// from CurReg.
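// Note: AndN2Op/AndOp/OrOp and ExecReg are set up by PhiLoweringHelper from
// the subtarget: S_ANDN2_B32/S_AND_B32/S_OR_B32 with $exec_lo on wave32, and
// the _B64 forms with $exec on wave64.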
void DivergenceLoweringHelper::buildMergeLaneMasks(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
    Register DstReg, Register PrevReg, Register CurReg) {
  // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
  // TODO: check if inputs are constants or results of a compare.

  Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
  Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
  Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);

  B.setInsertPt(MBB, I);
  B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
  B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
  B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
}

// GlobalISel has to constrain an S1 incoming value taken as-is with the lane
// mask register class. Insert a copy of Incoming.Reg to a new lane mask
// register inside Incoming.Block; Incoming.Reg becomes that new lane mask.
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
  B.setInsertPt(*In.Block, In.Block->getFirstTerminator());

  auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
  MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
  In.Reg = Copy.getReg(0);
}

void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
                                Register NewReg) {
  for (MachineOperand &Op : Inst->operands()) {
    if (Op.isReg() && Op.getReg() == Reg)
      Op.setReg(NewReg);
  }
}

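// Sketch of non-i1 temporal divergence lowering (illustrative MIR-like
// pseudocode, not taken from an actual test):
//
//   bb.loop:
//     %reg:_(s32) = ...
//     %vgpr:_(s32) = COPY %reg(s32), implicit $exec
//     ...                        ; divergent loop exit
//   bb.exit:
//     ... = G_ADD %vgpr, ...     ; was a use of %reg
//
// The copy to a VGPR with an implicit use of exec pins the value to the
// iteration in which each lane exited and keeps later passes from sinking it
// out of the loop.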
bool DivergenceLoweringHelper::lowerTemporalDivergence() {
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
  DenseMap<Register, Register> TDCache;

  for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
        ILMA.isS32S64LaneMask(Reg))
      continue;

    Register CachedTDCopy = TDCache.lookup(Reg);
    if (CachedTDCopy) {
      replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
      continue;
    }

    MachineInstr *Inst = MRI->getVRegDef(Reg);
    MachineBasicBlock *MBB = Inst->getParent();
    B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));

    Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
    B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
        .addUse(ExecReg, RegState::Implicit);

    replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
    TDCache[Reg] = VgprReg;
  }
  return false;
}

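// For i1 values the same problem is solved differently: lane masks live in
// scalar registers and are rewritten in every iteration, so instead of a
// plain copy the value is accumulated into a merged lane mask across the
// largest relevant cycle (buildMergeLaneMasks at the end of the def block,
// with phis placed by MachineSSAUpdater), and uses outside the cycle are
// rewritten to use the merged mask.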
bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
  initializeLaneMaskRegisterAttributes(BoolS1);
  MachineSSAUpdater SSAUpdater(*MF);

  // In case of a use outside multiple nested cycles, or multiple uses, we
  // only need to merge the lane mask across the largest relevant cycle.
  DenseMap<Register, std::pair<const MachineCycle *, Register>> LRCCache;
  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Reg);
    auto &CycleMergedMask = LRCCacheIter->getSecond();
    const MachineCycle *&CachedLRC = CycleMergedMask.first;
    if (RegNotCached || LRC->contains(CachedLRC)) {
      CachedLRC = LRC;
    }
  }

  for (auto &LRCCacheEntry : LRCCache) {
    Register Reg = LRCCacheEntry.first;
    auto &CycleMergedMask = LRCCacheEntry.getSecond();
    const MachineCycle *Cycle = CycleMergedMask.first;

    Register MergedMask = MRI->createVirtualRegister(BoolS1);
    SSAUpdater.Initialize(MergedMask);

    MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
    SSAUpdater.AddAvailableValue(MBB, MergedMask);

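    // No lane has exited the cycle yet on paths that enter it, so the merged
    // mask is undefined there: seed out-of-cycle predecessors of the cycle
    // entries with IMPLICIT_DEF.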
    for (auto Entry : Cycle->getEntries()) {
      for (MachineBasicBlock *Pred : Entry->predecessors()) {
        if (!Cycle->contains(Pred)) {
          B.setInsertPt(*Pred, Pred->getFirstTerminator());
          auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
          SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
        }
      }
    }

    buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
                        SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);

    CycleMergedMask.second = MergedMask;
  }

  for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache.lookup(Reg).second);
  }

  return false;
}

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                      "AMDGPU GlobalISel divergence lowering", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                    "AMDGPU GlobalISel divergence lowering", false, false)

char AMDGPUGlobalISelDivergenceLowering::ID = 0;

char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
    AMDGPUGlobalISelDivergenceLowering::ID;

FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
  return new AMDGPUGlobalISelDivergenceLowering();
}

bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
    MachineFunction &MF) {
  MachineDominatorTree &DT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &PDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();

  DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);

  bool Changed = false;
  // Temporal divergence lowering needs to inspect the list of instructions
  // used outside a cycle with a divergent exit, provided by the uniformity
  // analysis. Uniform instructions from that list require lowering, and no
  // instruction is deleted. It therefore has to run before lowerPhis, which
  // deletes the phis that require lowering and replaces them with new
  // instructions.

  // Non-i1 temporal divergence lowering.
  Changed |= Helper.lowerTemporalDivergence();
  // This covers both uniform and divergent i1s. Lane masks are in sgpr and
  // need to be updated in each iteration.
  Changed |= Helper.lowerTemporalDivergenceI1();
  // Temporal divergence lowering of divergent i1 phis used outside of the
  // cycle could also be handled by lowerPhis, but we do it in
  // lowerTemporalDivergenceI1 since in some cases lowerPhis does unnecessary
  // lane mask merging.
  Changed |= Helper.lowerPhis();
  return Changed;
}