SIPreEmitPeephole.cpp
1//===-- SIPreEmitPeephole.cpp ------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass performs the peephole optimizations before code emission.
11///
12/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32/F16,
13/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs such that they can be
14/// co-issued. This helps with overlapping MFMA and certain vector instructions
15/// in machine schedules and is expected to improve performance. Only packed
16/// instructions that fall within the MFMA's latency window are unpacked;
17/// the rest remain untouched.
18/// TODO: Add support for F16 packed instructions
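///
/// Illustrative example (assumes the default op_sel:[0,0] / op_sel_hi:[1,1]
/// modifiers; not taken from this file's tests):
///   v_pk_mul_f32 v[0:1], v[2:3], v[4:5]
/// is unpacked into
///   v_mul_f32_e64 v0, v2, v4
///   v_mul_f32_e64 v1, v3, v5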
19//===----------------------------------------------------------------------===//
20
21#include "AMDGPU.h"
22#include "GCNSubtarget.h"
23#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24#include "llvm/ADT/SetVector.h"
25#include "llvm/CodeGen/MachineFunctionPass.h"
26#include "llvm/CodeGen/TargetSchedule.h"
27#include "llvm/Support/BranchProbability.h"
28
29using namespace llvm;
30
31#define DEBUG_TYPE "si-pre-emit-peephole"
32
33namespace {
34
35class SIPreEmitPeephole {
36private:
37 const SIInstrInfo *TII = nullptr;
38 const SIRegisterInfo *TRI = nullptr;
39
40 bool optimizeVccBranch(MachineInstr &MI) const;
41 bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
42 bool getBlockDestinations(MachineBasicBlock &SrcMBB,
43 MachineBasicBlock *&TrueMBB,
44 MachineBasicBlock *&FalseMBB,
45 SmallVectorImpl<MachineOperand> &Cond);
46 bool mustRetainExeczBranch(const MachineInstr &Branch,
47 const MachineBasicBlock &From,
48 const MachineBasicBlock &To) const;
49 bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
50 // Check if the machine instruction being processed is a supported packed
51 // instruction.
52 bool isUnpackingSupportedInstr(MachineInstr &MI) const;
53 // Creates a list of packed instructions following an MFMA that are suitable
54 // for unpacking.
55 void collectUnpackingCandidates(MachineInstr &BeginMI,
56 SetVector<MachineInstr *> &InstrsToUnpack,
57 uint16_t NumMFMACycles);
58 // v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
59 // op_sel_hi:[0,0,0]
60 // ==>
61 // v_fma_f32 v0, v1, v3, v3
62 // v_fma_f32 v1, v0, v2, v2
63 // Here, we have overwritten v0 before we use it. This function checks if
64 // unpacking can lead to such a situation.
65 bool canUnpackingClobberRegister(const MachineInstr &MI);
66 // Unpack F32 packed instructions and insert the resulting unpacked
67 // instructions. Currently, only V_PK_MUL, V_PK_ADD, and V_PK_FMA are
68 // supported for this transformation.
69 void performF32Unpacking(MachineInstr &I);
70 // Select corresponding unpacked instruction
71 uint16_t mapToUnpackedOpcode(MachineInstr &I);
72 // Creates the unpacked instruction to be inserted. Adds source modifiers to
73 // the unpacked instructions based on the source modifiers in the packed
74 // instruction.
75 MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode,
76 bool IsHiBits);
77 // Process operands/source modifiers from packed instructions and insert the
78 // appropriate source modifiers and operands into the unpacked instructions.
79 void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods,
80 bool IsHiBits, const MachineOperand &SrcMO);
81
82public:
83 bool run(MachineFunction &MF);
84};
85
86class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
87public:
88 static char ID;
89
90 SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
91 initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
92 }
93
94 bool runOnMachineFunction(MachineFunction &MF) override {
95 return SIPreEmitPeephole().run(MF);
96 }
97};
98
99} // End anonymous namespace.
100
101INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
102 "SI peephole optimizations", false, false)
103
104char SIPreEmitPeepholeLegacy::ID = 0;
105
106char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;
107
108bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
109 // Match:
110 // sreg = -1 or 0
111 // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
112 // S_CBRANCH_VCC[N]Z
113 // =>
114 // S_CBRANCH_EXEC[N]Z
115 // We sometimes end up with this pattern after basic block placement.
116 // It happens when a block that assigns -1 or 0 to a saved mask is combined
117 // with a block that consumes that saved mask and then branches.
118 //
119 // While searching this also performs the following substitution:
120 // vcc = V_CMP
121 // vcc = S_AND exec, vcc
122 // S_CBRANCH_VCC[N]Z
123 // =>
124 // vcc = V_CMP
125 // S_CBRANCH_VCC[N]Z
126
127 bool Changed = false;
128 MachineBasicBlock &MBB = *MI.getParent();
129 const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
130 const bool IsWave32 = ST.isWave32();
131 const unsigned CondReg = TRI->getVCC();
132 const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
133 const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
134 const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
135 const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
136
137 MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
138 E = MBB.rend();
139 bool ReadsCond = false;
140 unsigned Threshold = 5;
141 for (++A; A != E; ++A) {
142 if (!--Threshold)
143 return false;
144 if (A->modifiesRegister(ExecReg, TRI))
145 return false;
146 if (A->modifiesRegister(CondReg, TRI)) {
147 if (!A->definesRegister(CondReg, TRI) ||
148 (A->getOpcode() != And && A->getOpcode() != AndN2))
149 return false;
150 break;
151 }
152 ReadsCond |= A->readsRegister(CondReg, TRI);
153 }
154 if (A == E)
155 return false;
156
157 MachineOperand &Op1 = A->getOperand(1);
158 MachineOperand &Op2 = A->getOperand(2);
159 if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
160 TII->commuteInstruction(*A);
161 Changed = true;
162 }
163 if (Op1.getReg() != ExecReg)
164 return Changed;
165 if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
166 return Changed;
167
168 int64_t MaskValue = 0;
169 Register SReg;
170 if (Op2.isReg()) {
171 SReg = Op2.getReg();
172 auto M = std::next(A);
173 bool ReadsSreg = false;
174 bool ModifiesExec = false;
175 for (; M != E; ++M) {
176 if (M->definesRegister(SReg, TRI))
177 break;
178 if (M->modifiesRegister(SReg, TRI))
179 return Changed;
180 ReadsSreg |= M->readsRegister(SReg, TRI);
181 ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
182 }
183 if (M == E)
184 return Changed;
185 // If SReg is VCC and its definition is a VALU comparison, the S_AND with
186 // EXEC is not required.
187 // Erase the S_AND and return.
188 // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
189 if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
190 TII->isVOPC(*M)) {
191 A->eraseFromParent();
192 return true;
193 }
194 if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
195 (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
196 return Changed;
197 MaskValue = M->getOperand(1).getImm();
198 // If SReg is only used in the AND instruction, fold the immediate
199 // into the AND.
200 if (!ReadsSreg && Op2.isKill()) {
201 A->getOperand(2).ChangeToImmediate(MaskValue);
202 M->eraseFromParent();
203 }
204 } else if (Op2.isImm()) {
205 MaskValue = Op2.getImm();
206 } else {
207 llvm_unreachable("Op2 must be register or immediate");
208 }
209
210 // Invert mask for s_andn2
211 assert(MaskValue == 0 || MaskValue == -1);
212 if (A->getOpcode() == AndN2)
213 MaskValue = ~MaskValue;
214
215 if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
216 if (!MI.killsRegister(CondReg, TRI)) {
217 // Replace AND with MOV
218 if (MaskValue == 0) {
219 BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
220 .addImm(0);
221 } else {
222 BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
223 .addReg(ExecReg);
224 }
225 }
226 // Remove AND instruction
227 A->eraseFromParent();
228 }
229
230 bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
231 if (SReg == ExecReg) {
232 // EXEC is updated directly
233 if (IsVCCZ) {
234 MI.eraseFromParent();
235 return true;
236 }
237 MI.setDesc(TII->get(AMDGPU::S_BRANCH));
238 } else if (IsVCCZ && MaskValue == 0) {
239 // Will always branch
240 // Remove all successors shadowed by new unconditional branch
241 MachineBasicBlock *Parent = MI.getParent();
242 SmallVector<MachineInstr *, 2> ToRemove;
243 bool Found = false;
244 for (MachineInstr &Term : Parent->terminators()) {
245 if (Found) {
246 if (Term.isBranch())
247 ToRemove.push_back(&Term);
248 } else {
249 Found = Term.isIdenticalTo(MI);
250 }
251 }
252 assert(Found && "conditional branch is not terminator");
253 for (auto *BranchMI : ToRemove) {
254 MachineOperand &Dst = BranchMI->getOperand(0);
255 assert(Dst.isMBB() && "destination is not basic block");
256 Parent->removeSuccessor(Dst.getMBB());
257 BranchMI->eraseFromParent();
258 }
259
260 if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
261 Parent->removeSuccessor(Succ);
262 }
263
264 // Rewrite to unconditional branch
265 MI.setDesc(TII->get(AMDGPU::S_BRANCH));
266 } else if (!IsVCCZ && MaskValue == 0) {
267 // Will never branch
268 MachineOperand &Dst = MI.getOperand(0);
269 assert(Dst.isMBB() && "destination is not basic block");
270 MI.getParent()->removeSuccessor(Dst.getMBB());
271 MI.eraseFromParent();
272 return true;
273 } else if (MaskValue == -1) {
274 // Depends only on EXEC
275 MI.setDesc(
276 TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
277 }
278
279 MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
280 MI.addImplicitDefUseOperands(*MBB.getParent());
281
282 return true;
283}
284
285bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
286 MachineInstr &MI) const {
287 MachineBasicBlock &MBB = *MI.getParent();
288 const MachineFunction &MF = *MBB.getParent();
289 const MachineRegisterInfo &MRI = MF.getRegInfo();
290 MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
291 Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
292 SmallVector<MachineInstr *, 4> ToRemove;
293 bool IdxOn = true;
294
295 if (!MI.isIdenticalTo(First))
296 return false;
297
298 // Scan forward from First to MI to verify removing the second S_SET_GPR_IDX_ON is safe.
299 for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
300 E = MI.getIterator();
301 I != E; ++I) {
302 if (I->isBundle())
303 continue;
304 switch (I->getOpcode()) {
305 case AMDGPU::S_SET_GPR_IDX_MODE:
306 return false;
307 case AMDGPU::S_SET_GPR_IDX_OFF:
308 IdxOn = false;
309 ToRemove.push_back(&*I);
310 break;
311 default:
312 if (I->modifiesRegister(AMDGPU::M0, TRI))
313 return false;
314 if (IdxReg && I->modifiesRegister(IdxReg, TRI))
315 return false;
316 if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
317 return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
318 })) {
319 // The only exception allowed here is another indirect vector move
320 // with the same mode.
321 if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
322 I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
323 return false;
324 }
325 }
326 }
327
328 MI.eraseFromBundle();
329 for (MachineInstr *RI : ToRemove)
330 RI->eraseFromBundle();
331 return true;
332}
333
334bool SIPreEmitPeephole::getBlockDestinations(
335 MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
336 MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
337 if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
338 return false;
339
340 if (!FalseMBB)
341 FalseMBB = SrcMBB.getNextNode();
342
343 return true;
344}
345
346namespace {
347class BranchWeightCostModel {
348 const SIInstrInfo &TII;
349 const TargetSchedModel &SchedModel;
350 BranchProbability BranchProb;
351 static constexpr uint64_t BranchNotTakenCost = 1;
352 uint64_t BranchTakenCost;
353 uint64_t ThenCyclesCost = 0;
354
355public:
356 BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
357 const MachineBasicBlock &Succ)
358 : TII(TII), SchedModel(TII.getSchedModel()) {
359 const MachineBasicBlock &Head = *Branch.getParent();
360 const auto *FromIt = find(Head.successors(), &Succ);
361 assert(FromIt != Head.succ_end());
362
363 BranchProb = Head.getSuccProbability(FromIt);
364 if (BranchProb.isUnknown())
365 BranchProb = BranchProbability::getZero();
366 BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
367 }
368
369 bool isProfitable(const MachineInstr &MI) {
370 if (TII.isWaitcnt(MI.getOpcode()))
371 return false;
372
373 ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
374
375 // Consider `P = N/D` to be the probability of execz being false (skipping
376 // the then-block). The transformation is profitable if always executing
377 // the 'then' block is cheaper than sometimes executing 'then' and always
378 // executing s_cbranch_execz:
379 // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
380 // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
381 // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
382 // BranchNotTakenCost
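// Illustrative numbers (not from the source): with BranchProb = 3/4
// (N = 3, D = 4), BranchTakenCost = 4, and BranchNotTakenCost = 1, the
// check below reduces to 1 * ThenCyclesCost <= 1 * 4 + 3 * 1, i.e. removal
// stays profitable while the accumulated 'then' cost is at most 7 cycles.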
383 uint64_t Numerator = BranchProb.getNumerator();
384 uint64_t Denominator = BranchProb.getDenominator();
385 return (Denominator - Numerator) * ThenCyclesCost <=
386 ((Denominator - Numerator) * BranchTakenCost +
387 Numerator * BranchNotTakenCost);
388 }
389};
390
391bool SIPreEmitPeephole::mustRetainExeczBranch(
392 const MachineInstr &Branch, const MachineBasicBlock &From,
393 const MachineBasicBlock &To) const {
394 assert(is_contained(Branch.getParent()->successors(), &From));
395 BranchWeightCostModel CostModel{*TII, Branch, From};
396
397 const MachineFunction *MF = From.getParent();
398 for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
399 MBBI != End && MBBI != ToI; ++MBBI) {
400 const MachineBasicBlock &MBB = *MBBI;
401
402 for (const MachineInstr &MI : MBB) {
403 // When a uniform loop is inside non-uniform control flow, the branch
404 // leaving the loop might never be taken when EXEC = 0.
405 // Hence we should retain the cbranch out of the loop lest it become infinite.
406 if (MI.isConditionalBranch())
407 return true;
408
409 if (MI.isUnconditionalBranch() &&
410 TII->getBranchDestBlock(MI) != MBB.getNextNode())
411 return true;
412
413 if (MI.isMetaInstruction())
414 continue;
415
416 if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
417 return true;
418
419 if (!CostModel.isProfitable(MI))
420 return true;
421 }
422 }
423
424 return false;
425}
426} // namespace
427
428// Returns true if the skip branch instruction is removed.
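// Illustrative shape of the pattern (assumed example, not from a test):
//   S_CBRANCH_EXECZ %bb.2    ; removed when the cost model deems it profitable
// bb.1:                       ; short 'then' block, harmless to run with EXEC == 0
//   ...
// bb.2: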
429bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
430 MachineBasicBlock &SrcMBB) {
431
432 if (!TII->getSchedModel().hasInstrSchedModel())
433 return false;
434
435 MachineBasicBlock *TrueMBB = nullptr;
436 MachineBasicBlock *FalseMBB = nullptr;
437 SmallVector<MachineOperand, 1> Cond;
438
439 if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
440 return false;
441
442 // Consider only the forward branches.
443 if (SrcMBB.getNumber() >= TrueMBB->getNumber())
444 return false;
445
446 // Consider only when it is legal and profitable
447 if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
448 return false;
449
450 LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
451 MI.eraseFromParent();
452 SrcMBB.removeSuccessor(TrueMBB);
453
454 return true;
455}
456
457// If support is extended to new operations, add tests in
458// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
459bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
460 if (!TII->isNeverCoissue(MI))
461 return false;
462 unsigned Opcode = MI.getOpcode();
463 switch (Opcode) {
464 case AMDGPU::V_PK_ADD_F32:
465 case AMDGPU::V_PK_MUL_F32:
466 case AMDGPU::V_PK_FMA_F32:
467 return true;
468 default:
469 return false;
470 }
471 llvm_unreachable("Fully covered switch");
472}
473
474bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
475 unsigned OpCode = MI.getOpcode();
476 Register DstReg = MI.getOperand(0).getReg();
477 // Only the first register in the register pair needs to be checked due to the
478 // unpacking order. Packed instructions are unpacked such that the lower 32
479 // bits (i.e., the first register in the pair) are written first. This can
480 // introduce dependencies if the first register is written in one instruction
481 // and then read as part of the higher 32 bits in the subsequent instruction.
482 // Such scenarios can arise due to specific combinations of op_sel and
483 // op_sel_hi modifiers.
484 Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
485
486 const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
487 if (Src0MO && Src0MO->isReg()) {
488 Register SrcReg0 = Src0MO->getReg();
489 unsigned Src0Mods =
490 TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
491 Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
492 ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
493 : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
494 // Check if the register selected by op_sel_hi is the same as the first
495 // register in the destination register pair.
496 if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
497 return true;
498 }
499
500 const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
501 if (Src1MO && Src1MO->isReg()) {
502 Register SrcReg1 = Src1MO->getReg();
503 unsigned Src1Mods =
504 TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
505 Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
506 ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
507 : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
508 if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
509 return true;
510 }
511
512 // Applicable for packed instructions with 3 source operands, such as
513 // V_PK_FMA.
514 if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
515 const MachineOperand *Src2MO =
516 TII->getNamedOperand(MI, AMDGPU::OpName::src2);
517 if (Src2MO && Src2MO->isReg()) {
518 Register SrcReg2 = Src2MO->getReg();
519 unsigned Src2Mods =
520 TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
521 Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
522 ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
523 : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
524 if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
525 return true;
526 }
527 }
528 return false;
529}
530
531uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
532 unsigned Opcode = I.getOpcode();
533 // Use the 64-bit encoding to allow use of VOP3 instructions.
534 // VOP3 e64 instructions allow source modifiers;
535 // e32 instructions do not.
536 switch (Opcode) {
537 case AMDGPU::V_PK_ADD_F32:
538 return AMDGPU::V_ADD_F32_e64;
539 case AMDGPU::V_PK_MUL_F32:
540 return AMDGPU::V_MUL_F32_e64;
541 case AMDGPU::V_PK_FMA_F32:
542 return AMDGPU::V_FMA_F32_e64;
543 default:
544 return std::numeric_limits<uint16_t>::max();
545 }
546 llvm_unreachable("Fully covered switch");
547}
548
549void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
550 unsigned SrcMods, bool IsHiBits,
551 const MachineOperand &SrcMO) {
552 unsigned NewSrcMods = 0;
553 unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
554 unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
555 // Packed instructions (VOP3P) do not support ABS. Hence, no checks are done
556 // for ABS modifiers.
557 // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
558 // lane.
559 // NEG_HI shares the same bit position with ABS. But packed instructions do
560 // not support ABS. Therefore, NEG_HI must be translated to NEG source
561 // modifier for the higher 32 bits. Unpacked VOP3 instructions support
562 // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
563 // NEG modifier if present in the packed instruction.
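// Illustrative example (assumed operands, not from the source): for
//   v_pk_add_f32 v[0:1], v[2:3], v[4:5] neg_hi:[1,0]
// the high half unpacks to
//   v_add_f32_e64 v1, -v3, v5
// where NEG_HI on src0 has become a NEG source modifier on the unpacked
// instruction that produces the high 32 bits.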
564 if (SrcMods & NegModifier)
565 NewSrcMods |= SISrcMods::NEG;
566 // Src modifiers. Only NEG modifiers are added if needed. Unpacked
567 // operations do not have op_sel, so op_sel must be handled explicitly, as
568 // done below.
569 NewMI.addImm(NewSrcMods);
570 if (SrcMO.isImm()) {
571 NewMI.addImm(SrcMO.getImm());
572 return;
573 }
574 // If op_sel == 0, select the low half (sub0) of the source register pair.
575 Register UnpackedSrcReg = (SrcMods & OpSelModifier)
576 ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
577 : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
578
579 MachineOperand UnpackedSrcMO =
580 MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
581 if (SrcMO.isKill()) {
582 // For each unpacked instruction, mark its source registers as killed if the
583 // corresponding source register in the original packed instruction was
584 // marked as killed.
585 //
586 // Exception:
587 // If the op_sel and op_sel_hi modifiers require both unpacked instructions
588 // to use the same register (e.g., due to overlapping access to low/high
589 // bits of the same packed register), then only the *second* (latter)
590 // instruction should mark the register as killed. This is because the
591 // second instruction handles the higher bits and is effectively the last
592 // user of the full register pair.
593
594 bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
595 bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
596 bool KillState = true;
597 if ((OpSel == OpSelHi) && !IsHiBits)
598 KillState = false;
599 UnpackedSrcMO.setIsKill(KillState);
600 }
601 NewMI.add(UnpackedSrcMO);
602}
603
604void SIPreEmitPeephole::collectUnpackingCandidates(
605 MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
606 uint16_t NumMFMACycles) {
607 auto *BB = BeginMI.getParent();
608 auto E = BB->end();
609 int TotalCyclesBetweenCandidates = 0;
610 auto SchedModel = TII->getSchedModel();
611 Register MFMADef = BeginMI.getOperand(0).getReg();
612
613 for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
614 MachineInstr &Instr = *I;
615 if (Instr.isMetaInstruction())
616 continue;
617 if ((Instr.isTerminator()) ||
618 (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
619 (SIInstrInfo::modifiesModeRegister(Instr) &&
620 Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
621 return;
622
623 const MCSchedClassDesc *InstrSchedClassDesc =
624 SchedModel.resolveSchedClass(&Instr);
625 uint16_t Latency =
626 SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
627 TotalCyclesBetweenCandidates += Latency;
628
629 if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
630 return;
631 // Identify register dependencies between those used by the MFMA
632 // instruction and the following packed instructions. Also checks for
633 // transitive dependencies between the MFMA def and candidate instruction
634 // def and uses. Conservatively ensures that we do not incorrectly
635 // read/write registers.
636 for (const MachineOperand &InstrMO : Instr.operands()) {
637 if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
638 continue;
639 if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
640 return;
641 }
642 if (!isUnpackingSupportedInstr(Instr))
643 continue;
644
645 if (canUnpackingClobberRegister(Instr))
646 return;
647 // If it's a packed instruction, adjust latency: remove the packed
648 // latency, add latency of two unpacked instructions (currently estimated
649 // as 2 cycles).
650 TotalCyclesBetweenCandidates -= Latency;
651 // TODO: improve latency handling based on instruction modeling.
652 TotalCyclesBetweenCandidates += 2;
653 // Subtract 1 to account for MFMA issue latency.
654 if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
655 InstrsToUnpack.insert(&Instr);
656 }
657 return;
658}
659
660void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
661 MachineOperand DstOp = I.getOperand(0);
662
663 uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
664 assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
665 "Unsupported Opcode");
666
667 MachineInstrBuilder Op0LOp1L =
668 createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
669 MachineOperand LoDstOp = Op0LOp1L->getOperand(0);
670
671 LoDstOp.setIsUndef(DstOp.isUndef());
672
673 MachineInstrBuilder Op0HOp1H =
674 createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
675 MachineOperand HiDstOp = Op0HOp1H->getOperand(0);
676
677 if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
678 Op0LOp1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
679 Op0HOp1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
680 }
681 if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
682 Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract);
683 Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract);
684 }
685
686 LoDstOp.setIsRenamable(DstOp.isRenamable());
687 HiDstOp.setIsRenamable(DstOp.isRenamable());
688
689 I.eraseFromParent();
690 return;
691}
692
693MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
694 uint16_t UnpackedOpcode,
695 bool IsHiBits) {
696 MachineBasicBlock &MBB = *I.getParent();
697 const DebugLoc &DL = I.getDebugLoc();
698 const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
699 const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
700 Register DstReg = I.getOperand(0).getReg();
701 unsigned OpCode = I.getOpcode();
702 Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
703 : TRI->getSubReg(DstReg, AMDGPU::sub0);
704
705 int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
706 unsigned Src0Mods =
707 TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
708 unsigned Src1Mods =
709 TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();
710
711 MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
712 NewMI.addDef(UnpackedDstReg); // vdst
713 addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
714 addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
715
716 if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
717 const MachineOperand *SrcMO3 =
718 TII->getNamedOperand(I, AMDGPU::OpName::src2);
719 unsigned Src2Mods =
720 TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
721 addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
722 }
723 NewMI.addImm(ClampVal); // clamp
724 // Packed instructions do not support output modifiers, so it is safe to
725 // assign 0 here.
726 NewMI.addImm(0); // omod
727 return NewMI;
728}
729
730PreservedAnalyses
731SIPreEmitPeepholePass::run(MachineFunction &MF,
732 MachineFunctionAnalysisManager &MFAM) {
733 if (!SIPreEmitPeephole().run(MF))
734 return PreservedAnalyses::all();
735
736 return getMachineFunctionPassPreservedAnalyses();
737}
738
738
739bool SIPreEmitPeephole::run(MachineFunction &MF) {
740 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
741 TII = ST.getInstrInfo();
742 TRI = &TII->getRegisterInfo();
743 bool Changed = false;
744
745 MF.RenumberBlocks();
746
747 for (MachineBasicBlock &MBB : MF) {
748 MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
749 // Check first terminator for branches to optimize
750 if (TermI != MBB.end()) {
751 MachineInstr &MI = *TermI;
752 switch (MI.getOpcode()) {
753 case AMDGPU::S_CBRANCH_VCCZ:
754 case AMDGPU::S_CBRANCH_VCCNZ:
755 Changed |= optimizeVccBranch(MI);
756 break;
757 case AMDGPU::S_CBRANCH_EXECZ:
758 Changed |= removeExeczBranch(MI, MBB);
759 break;
760 }
761 }
762
763 if (!ST.hasVGPRIndexMode())
764 continue;
765
766 MachineInstr *SetGPRMI = nullptr;
767 const unsigned Threshold = 20;
768 unsigned Count = 0;
769 // Scan the block for two S_SET_GPR_IDX_ON instructions to see if the
770 // second one is redundant. Expensive checks are done in optimizeSetGPR(),
771 // and the distance is limited to 20 instructions for compile-time purposes.
772 // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
773 // may be bundled with the instructions they modify.
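// Illustrative MIR-level sketch (assumed operands, not from a test):
//   S_SET_GPR_IDX_ON  $sgpr2, 1
//   $vgpr0 = V_MOV_B32_indirect_read ...
//   S_SET_GPR_IDX_OFF
//   S_SET_GPR_IDX_ON  $sgpr2, 1      ; identical to the first one
//   $vgpr1 = V_MOV_B32_indirect_read ...
//   S_SET_GPR_IDX_OFF
// Here the second S_SET_GPR_IDX_ON and the intervening S_SET_GPR_IDX_OFF can
// be removed, leaving one index-mode region around both indirect moves.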
774 for (auto &MI : make_early_inc_range(MBB.instrs())) {
775 if (Count == Threshold)
776 SetGPRMI = nullptr;
777 else
778 ++Count;
779
780 if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
781 continue;
782
783 Count = 0;
784 if (!SetGPRMI) {
785 SetGPRMI = &MI;
786 continue;
787 }
788
789 if (optimizeSetGPR(*SetGPRMI, MI))
790 Changed = true;
791 else
792 SetGPRMI = &MI;
793 }
794 }
795
796 // TODO: Fold this into previous block, if possible. Evaluate and handle any
797 // side effects.
798 for (MachineBasicBlock &MBB : MF) {
799 // Unpack packed instructions overlapped by MFMAs. This allows the compiler
800 // to co-issue unpacked instructions with MFMAs.
801 auto SchedModel = TII->getSchedModel();
802 SetVector<MachineInstr *> InstrsToUnpack;
803 for (auto &MI : make_early_inc_range(MBB.instrs())) {
804 if (!SIInstrInfo::isMFMA(MI))
805 continue;
806 const MCSchedClassDesc *SchedClassDesc =
807 SchedModel.resolveSchedClass(&MI);
808 uint16_t NumMFMACycles =
809 SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
810 collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
811 }
812 for (MachineInstr *MI : InstrsToUnpack) {
813 performF32Unpacking(*MI);
814 }
815 }
816
817 return Changed;
818}