1//===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
14#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15#include "SIInstrInfo.h"
16#include "SIMachineFunctionInfo.h"
17#include "SIRegisterInfo.h"
22
23#define DEBUG_TYPE "si-fold-operands"
24using namespace llvm;
25
26namespace {
27
28/// Track a value we may want to fold into downstream users, applying
29/// subregister extracts along the way.
30struct FoldableDef {
31 union {
32 MachineOperand *OpToFold = nullptr;
33 uint64_t ImmToFold;
34 int FrameIndexToFold;
35 };
36
37 /// Register class of the originally defined value.
38 const TargetRegisterClass *DefRC = nullptr;
39
40 /// Track the original defining instruction for the value.
41 const MachineInstr *DefMI = nullptr;
42
43 /// Subregister to apply to the value at the use point.
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
45
46 /// Kind of value stored in the union.
47 MachineOperand::MachineOperandType Kind;
48
49 FoldableDef() = delete;
50 FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
53
54 if (FoldOp.isImm()) {
55 ImmToFold = FoldOp.getImm();
56 } else if (FoldOp.isFI()) {
57 FrameIndexToFold = FoldOp.getIndex();
58 } else {
59 assert(FoldOp.isReg() || FoldOp.isGlobal());
60 OpToFold = &FoldOp;
61 }
62
63 DefMI = FoldOp.getParent();
64 }
65
66 FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
69 Kind(MachineOperand::MO_Immediate) {}
70
71 /// Copy the current def and apply \p SubReg to the value.
72 FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
73 FoldableDef Copy(*this);
74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
75 return Copy;
76 }
77
78 bool isReg() const { return Kind == MachineOperand::MO_Register; }
79
80 Register getReg() const {
81 assert(isReg());
82 return OpToFold->getReg();
83 }
84
85 unsigned getSubReg() const {
86 assert(isReg());
87 return OpToFold->getSubReg();
88 }
89
90 bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
91
92 bool isFI() const {
93 return Kind == MachineOperand::MO_FrameIndex;
94 }
95
96 int getFI() const {
97 assert(isFI());
98 return FrameIndexToFold;
99 }
100
101 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
102
103 /// Return the effective immediate value defined by this instruction, after
104 /// application of any subregister extracts which may exist between the use
105 /// and def instruction.
106 std::optional<int64_t> getEffectiveImmVal() const {
107 assert(isImm());
108 return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
109 }
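// For illustration (hand-worked example, values invented): with
// ImmToFold = 0x1111222233334444 and DefSubReg = sub1 the effective value is
// the high half 0x11112222, with sub0 it is 0x33334444, and with no
// subregister the full 64-bit value is returned unchanged.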
110
111 /// Check if it is legal to fold this effective value into \p MI's \p OpNo
112 /// operand.
113 bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
114 unsigned OpIdx) const {
115 switch (Kind) {
116 case MachineOperand::MO_Immediate: {
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
118 if (!ImmToFold)
119 return false;
120
121 // TODO: Should verify the subregister index is supported by the class
122 // TODO: Avoid the temporary MachineOperand
123 MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
125 }
126 case MachineOperand::MO_FrameIndex: {
127 if (DefSubReg != AMDGPU::NoSubRegister)
128 return false;
129 MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
131 }
132 default:
133 // TODO: Try to apply DefSubReg, for global address we can extract
134 // low/high.
135 if (DefSubReg != AMDGPU::NoSubRegister)
136 return false;
137 return TII.isOperandLegal(MI, OpIdx, OpToFold);
138 }
139
140 llvm_unreachable("covered MachineOperand kind switch");
141 }
142};
143
144struct FoldCandidate {
145 MachineInstr *UseMI;
146 FoldableDef Def;
147 int ShrinkOpcode;
148 unsigned UseOpNo;
149 bool Commuted;
150
151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
152 bool Commuted = false, int ShrinkOp = -1)
153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
155
156 bool isFI() const { return Def.isFI(); }
157
158 int getFI() const {
159 assert(isFI());
160 return Def.FrameIndexToFold;
161 }
162
163 bool isImm() const { return Def.isImm(); }
164
165 bool isReg() const { return Def.isReg(); }
166
167 Register getReg() const { return Def.getReg(); }
168
169 bool isGlobal() const { return Def.isGlobal(); }
170
171 bool needsShrink() const { return ShrinkOpcode != -1; }
172};
173
174class SIFoldOperandsImpl {
175public:
176 MachineFunction *MF;
177 MachineRegisterInfo *MRI;
178 const SIInstrInfo *TII;
179 const SIRegisterInfo *TRI;
180 const GCNSubtarget *ST;
181 const SIMachineFunctionInfo *MFI;
182
183 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
184 const FoldableDef &OpToFold) const;
185
186 // TODO: Just use TII::getVALUOp
187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
188 switch (Opc) {
189 case AMDGPU::S_ADD_I32: {
190 if (ST->hasAddNoCarry())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
193 }
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
200 default:
201 return AMDGPU::INSTRUCTION_LIST_END;
202 }
203 }
204
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
206 MachineInstr &MI) const;
207
208 bool updateOperand(FoldCandidate &Fold) const;
209
210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
211 int64_t ImmVal) const;
212
213 /// Try to fold immediate \p ImmVal into \p MI's operand at index \p UseOpNo.
214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
215 int64_t ImmVal) const;
216
217 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
218 MachineInstr *MI, unsigned OpNo,
219 const FoldableDef &OpToFold) const;
220 bool isUseSafeToFold(const MachineInstr &MI,
221 const MachineOperand &UseMO) const;
222
223 const TargetRegisterClass *getRegSeqInit(
224 MachineInstr &RegSeq,
225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
226
227 const TargetRegisterClass *
228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
229 Register UseReg) const;
230
231 std::pair<int64_t, const TargetRegisterClass *>
232 isRegSeqSplat(MachineInstr &RegSeg) const;
233
234 bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
235 int64_t SplatVal,
236 const TargetRegisterClass *SplatRC) const;
237
238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
239 unsigned UseOpIdx,
240 SmallVectorImpl<FoldCandidate> &FoldList) const;
241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
242 SmallVectorImpl<FoldCandidate> &FoldList,
243 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
244
245 std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
246 bool tryConstantFoldOp(MachineInstr *MI) const;
247 bool tryFoldCndMask(MachineInstr &MI) const;
248 bool tryFoldZeroHighBits(MachineInstr &MI) const;
249 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
250
251 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
252 bool tryFoldFoldableCopy(MachineInstr &MI,
253 MachineOperand *&CurrentKnownM0Val) const;
254
255 const MachineOperand *isClamp(const MachineInstr &MI) const;
256 bool tryFoldClamp(MachineInstr &MI);
257
258 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
259 bool tryFoldOMod(MachineInstr &MI);
260 bool tryFoldRegSequence(MachineInstr &MI);
261 bool tryFoldPhiAGPR(MachineInstr &MI);
262 bool tryFoldLoad(MachineInstr &MI);
263
264 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
265
266public:
267 SIFoldOperandsImpl() = default;
268
269 bool run(MachineFunction &MF);
270};
271
272class SIFoldOperandsLegacy : public MachineFunctionPass {
273public:
274 static char ID;
275
276 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
277
278 bool runOnMachineFunction(MachineFunction &MF) override {
279 if (skipFunction(MF.getFunction()))
280 return false;
281 return SIFoldOperandsImpl().run(MF);
282 }
283
284 StringRef getPassName() const override { return "SI Fold Operands"; }
285
286 void getAnalysisUsage(AnalysisUsage &AU) const override {
287 AU.setPreservesCFG();
288 MachineFunctionPass::getAnalysisUsage(AU);
289 }
290
291 MachineFunctionProperties getRequiredProperties() const override {
292 return MachineFunctionProperties().setIsSSA();
293 }
294};
295
296} // End anonymous namespace.
297
298INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
299 false)
300
301char SIFoldOperandsLegacy::ID = 0;
302
303char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
304
305static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
306 const TargetRegisterInfo &TRI,
307 const MachineOperand &MO) {
308 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
309 if (const TargetRegisterClass *SubRC =
310 TRI.getSubRegisterClass(RC, MO.getSubReg()))
311 RC = SubRC;
312 return RC;
313}
314
315// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
316static unsigned macToMad(unsigned Opc) {
317 switch (Opc) {
318 case AMDGPU::V_MAC_F32_e64:
319 return AMDGPU::V_MAD_F32_e64;
320 case AMDGPU::V_MAC_F16_e64:
321 return AMDGPU::V_MAD_F16_e64;
322 case AMDGPU::V_FMAC_F32_e64:
323 return AMDGPU::V_FMA_F32_e64;
324 case AMDGPU::V_FMAC_F16_e64:
325 return AMDGPU::V_FMA_F16_gfx9_e64;
326 case AMDGPU::V_FMAC_F16_t16_e64:
327 return AMDGPU::V_FMA_F16_gfx9_t16_e64;
328 case AMDGPU::V_FMAC_F16_fake16_e64:
329 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
330 case AMDGPU::V_FMAC_LEGACY_F32_e64:
331 return AMDGPU::V_FMA_LEGACY_F32_e64;
332 case AMDGPU::V_FMAC_F64_e64:
333 return AMDGPU::V_FMA_F64_e64;
334 }
335 return AMDGPU::INSTRUCTION_LIST_END;
336}
337
338// TODO: Add heuristic that the frame index might not fit in the addressing mode
339// immediate offset to avoid materializing in loops.
340bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
341 const FoldableDef &OpToFold) const {
342 if (!OpToFold.isFI())
343 return false;
344
345 const unsigned Opc = UseMI.getOpcode();
346 switch (Opc) {
347 case AMDGPU::S_ADD_I32:
348 case AMDGPU::S_ADD_U32:
349 case AMDGPU::V_ADD_U32_e32:
350 case AMDGPU::V_ADD_CO_U32_e32:
351 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
352 // to insert the wave size shift at every point we use the index.
353 // TODO: Fix depending on visit order to fold immediates into the operand
354 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
355 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
356 case AMDGPU::V_ADD_U32_e64:
357 case AMDGPU::V_ADD_CO_U32_e64:
358 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
359 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
360 default:
361 break;
362 }
363
364 if (TII->isMUBUF(UseMI))
365 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
366 if (!TII->isFLATScratch(UseMI))
367 return false;
368
369 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
370 if (OpNo == SIdx)
371 return true;
372
373 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
374 return OpNo == VIdx && SIdx == -1;
375}
376
377/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
378///
379/// => %vgpr = V_ADD_U32 x, frameindex
380bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
381 Register DstReg, Register SrcReg, MachineInstr &MI) const {
382 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
383 MRI->hasOneNonDBGUse(SrcReg)) {
384 MachineInstr *Def = MRI->getVRegDef(SrcReg);
385 if (!Def || Def->getNumOperands() != 4)
386 return false;
387
388 MachineOperand *Src0 = &Def->getOperand(1);
389 MachineOperand *Src1 = &Def->getOperand(2);
390
391 // TODO: This is profitable with more operand types, and for more
392 // opcodes. But ultimately this is working around poor / nonexistent
393 // regbankselect.
394 if (!Src0->isFI() && !Src1->isFI())
395 return false;
396
397 if (Src0->isFI())
398 std::swap(Src0, Src1);
399
400 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
401 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
402 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
403 !Def->getOperand(3).isDead()) // Check if scc is dead
404 return false;
405
406 MachineBasicBlock *MBB = Def->getParent();
407 const DebugLoc &DL = Def->getDebugLoc();
408 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
409 MachineInstrBuilder Add =
410 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
411
412 if (Add->getDesc().getNumDefs() == 2) {
413 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
414 Add.addDef(CarryOutReg, RegState::Dead);
415 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
416 }
417
418 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
419 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
420 Add.addImm(0);
421
422 Def->eraseFromParent();
423 MI.eraseFromParent();
424 return true;
425 }
426
427 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
428
429 MachineBasicBlock::LivenessQueryResult Liveness =
430 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
431 if (Liveness == MachineBasicBlock::LQR_Dead) {
432 // TODO: If src1 satisfies operand constraints, use vop3 version.
433 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
434 .add(*Src0)
435 .add(*Src1)
436 .setOperandDead(3) // implicit-def $vcc
437 .setMIFlags(Def->getFlags());
438 Def->eraseFromParent();
439 MI.eraseFromParent();
440 return true;
441 }
442 }
443
444 return false;
445}
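// Illustrative sketch of the rewrite above on a subtarget without
// add-no-carry (register names invented):
//   %vgpr = COPY (S_ADD_I32 x, frameindex)   ; only when the scc def is dead
//   =>
//   %vgpr, dead %carry = V_ADD_CO_U32_e64 x, frameindex, 0
// where the dead carry-out vreg is given a VCC allocation hint, or the e32
// form with an implicit dead $vcc when VCC is known dead at this point.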
446
447FunctionPass *llvm::createSIFoldOperandsLegacyPass() {
448 return new SIFoldOperandsLegacy();
449}
450
451bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
452 unsigned UseOpNo,
453 int64_t ImmVal) const {
454 const uint64_t TSFlags = MI->getDesc().TSFlags;
455
456 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
457 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
458 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
459 return false;
460
461 const MachineOperand &Old = MI->getOperand(UseOpNo);
462 int OpNo = MI->getOperandNo(&Old);
463
464 unsigned Opcode = MI->getOpcode();
465 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
466 switch (OpType) {
467 default:
468 return false;
476 // VOP3 packed instructions ignore op_sel source modifiers, we cannot encode
477 // two different constants.
478 if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) &&
479 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
480 return false;
481 break;
482 }
483
484 return true;
485}
486
487bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
488 int64_t ImmVal) const {
489 MachineOperand &Old = MI->getOperand(UseOpNo);
490 unsigned Opcode = MI->getOpcode();
491 int OpNo = MI->getOperandNo(&Old);
492 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
493
494 // If the literal can be inlined as-is, apply it and short-circuit the
495 // tests below. The main motivation for this is to avoid unintuitive
496 // uses of opsel.
497 if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
498 Old.ChangeToImmediate(ImmVal);
499 return true;
500 }
501
502 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
503 // op_sel in a way that allows an inline constant.
504 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
505 unsigned SrcIdx = ~0;
506 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
507 ModName = AMDGPU::OpName::src0_modifiers;
508 SrcIdx = 0;
509 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
510 ModName = AMDGPU::OpName::src1_modifiers;
511 SrcIdx = 1;
512 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
513 ModName = AMDGPU::OpName::src2_modifiers;
514 SrcIdx = 2;
515 }
516 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
517 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
518 MachineOperand &Mod = MI->getOperand(ModIdx);
519 unsigned ModVal = Mod.getImm();
520
521 uint16_t ImmLo =
522 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
523 uint16_t ImmHi =
524 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
525 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
526 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
527
528 // Helper function that attempts to inline the given value with a newly
529 // chosen opsel pattern.
530 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
531 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
532 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
533 Old.ChangeToImmediate(Imm);
534 return true;
535 }
536
537 // Try to shuffle the halves around and leverage opsel to get an inline
538 // constant.
539 uint16_t Lo = static_cast<uint16_t>(Imm);
540 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
541 if (Lo == Hi) {
542 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
543 Mod.setImm(NewModVal);
544 Old.ChangeToImmediate(Lo);
545 return true;
546 }
547
548 if (static_cast<int16_t>(Lo) < 0) {
549 int32_t SExt = static_cast<int16_t>(Lo);
550 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
551 Mod.setImm(NewModVal);
552 Old.ChangeToImmediate(SExt);
553 return true;
554 }
555 }
556
557 // This check is only useful for integer instructions
558 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16) {
559 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
560 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
561 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
562 return true;
563 }
564 }
565 } else {
566 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
567 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
568 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
569 Old.ChangeToImmediate(Swapped);
570 return true;
571 }
572 }
573
574 return false;
575 };
576
577 if (tryFoldToInline(Imm))
578 return true;
579
580 // Replace integer addition by subtraction and vice versa if it allows
581 // folding the immediate to an inline constant.
582 //
583 // We should only ever get here for SrcIdx == 1 due to canonicalization
584 // earlier in the pipeline, but we double-check here to be safe / fully
585 // general.
586 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
587 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
588 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
589 unsigned ClampIdx =
590 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
591 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
592
593 if (!Clamp) {
594 uint16_t NegLo = -static_cast<uint16_t>(Imm);
595 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
596 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
597
598 if (tryFoldToInline(NegImm)) {
599 unsigned NegOpcode =
600 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
601 MI->setDesc(TII->get(NegOpcode));
602 return true;
603 }
604 }
605 }
606
607 return false;
608}
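// Hand-worked illustration of the opsel folding above (values invented): the
// literal 0xFFC1FFC1 (both halves -63) fed to V_PK_ADD_U16 has no inlinable
// opsel arrangement, but its negation 0x003F003F (both halves 63) does, so
// the instruction is rewritten to V_PK_SUB_U16 with an inline 63 operand;
// this is only done when clamp is clear, since saturation does not commute
// with the negation.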
609
610bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
611 MachineInstr *MI = Fold.UseMI;
612 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
613 assert(Old.isReg());
614
615 std::optional<int64_t> ImmVal;
616 if (Fold.isImm())
617 ImmVal = Fold.Def.getEffectiveImmVal();
618
619 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
620 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
621 return true;
622
623 // We can't represent the candidate as an inline constant. Try as a literal
624 // with the original opsel, checking constant bus limitations.
625 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
626 int OpNo = MI->getOperandNo(&Old);
627 if (!TII->isOperandLegal(*MI, OpNo, &New))
628 return false;
629 Old.ChangeToImmediate(*ImmVal);
630 return true;
631 }
632
633 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
634 MachineBasicBlock *MBB = MI->getParent();
635 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
636 if (Liveness != MachineBasicBlock::LQR_Dead) {
637 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
638 return false;
639 }
640
641 int Op32 = Fold.ShrinkOpcode;
642 MachineOperand &Dst0 = MI->getOperand(0);
643 MachineOperand &Dst1 = MI->getOperand(1);
644 assert(Dst0.isDef() && Dst1.isDef());
645
646 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
647
648 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
649 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
650
651 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
652
653 if (HaveNonDbgCarryUse) {
654 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
655 Dst1.getReg())
656 .addReg(AMDGPU::VCC, RegState::Kill);
657 }
658
659 // Keep the old instruction around to avoid breaking iterators, but
660 // replace it with a dummy instruction to remove uses.
661 //
662 // FIXME: We should not invert how this pass looks at operands to avoid
663 // this. Should track set of foldable movs instead of looking for uses
664 // when looking at a use.
665 Dst0.setReg(NewReg0);
666 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
667 MI->removeOperand(I);
668 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
669
670 if (Fold.Commuted)
671 TII->commuteInstruction(*Inst32, false);
672 return true;
673 }
674
675 assert(!Fold.needsShrink() && "not handled");
676
677 if (ImmVal) {
678 if (Old.isTied()) {
679 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
680 if (NewMFMAOpc == -1)
681 return false;
682 MI->setDesc(TII->get(NewMFMAOpc));
683 MI->untieRegOperand(0);
684 }
685
686 // TODO: Should we try to avoid adding this to the candidate list?
687 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
688 int OpNo = MI->getOperandNo(&Old);
689 if (!TII->isOperandLegal(*MI, OpNo, &New))
690 return false;
691
692 Old.ChangeToImmediate(*ImmVal);
693 return true;
694 }
695
696 if (Fold.isGlobal()) {
697 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
698 Fold.Def.OpToFold->getOffset(),
699 Fold.Def.OpToFold->getTargetFlags());
700 return true;
701 }
702
703 if (Fold.isFI()) {
704 Old.ChangeToFrameIndex(Fold.getFI());
705 return true;
706 }
707
708 MachineOperand *New = Fold.Def.OpToFold;
709
710 // Verify the register is compatible with the operand.
711 if (const TargetRegisterClass *OpRC =
712 TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI, *MF)) {
713 const TargetRegisterClass *OldRC = MRI->getRegClass(Old.getReg());
714 const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg());
715 unsigned NewSubReg = New->getSubReg();
716 unsigned OldSubReg = Old.getSubReg();
717
718 const TargetRegisterClass *ConstrainRC = OpRC;
719 if (NewSubReg && OldSubReg) {
720 unsigned PreA, PreB;
721 ConstrainRC = TRI->getCommonSuperRegClass(OpRC, OldSubReg, NewRC,
722 NewSubReg, PreA, PreB);
723 } else if (OldSubReg) {
724 ConstrainRC = TRI->getMatchingSuperRegClass(OldRC, OpRC, OldSubReg);
725 } else if (NewSubReg) {
726 ConstrainRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, NewSubReg);
727 }
728
729 if (!ConstrainRC)
730 return false;
731
732 if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
733 LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
734 << TRI->getRegClassName(ConstrainRC) << '\n');
735 return false;
736 }
737 }
738
739 // Rework once the VS_16 register class is updated to include proper
740 // 16-bit SGPRs instead of 32-bit ones.
741 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
742 Old.setSubReg(AMDGPU::NoSubRegister);
743 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
744 Old.setIsUndef(New->isUndef());
745 return true;
746}
747
748static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
749 FoldCandidate &&Entry) {
750 // Skip additional folding on the same operand.
751 for (FoldCandidate &Fold : FoldList)
752 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
753 return;
754 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
755 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
756 FoldList.push_back(Entry);
757}
758
759static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
760 MachineInstr *MI, unsigned OpNo,
761 const FoldableDef &FoldOp,
762 bool Commuted = false, int ShrinkOp = -1) {
763 appendFoldCandidate(FoldList,
764 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
765}
766
767bool SIFoldOperandsImpl::tryAddToFoldList(
768 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
769 const FoldableDef &OpToFold) const {
770 const unsigned Opc = MI->getOpcode();
771
772 auto tryToFoldAsFMAAKorMK = [&]() {
773 if (!OpToFold.isImm())
774 return false;
775
776 const bool TryAK = OpNo == 3;
777 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
778 MI->setDesc(TII->get(NewOpc));
779
780 // We have to fold into operand which would be Imm not into OpNo.
781 bool FoldAsFMAAKorMK =
782 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
783 if (FoldAsFMAAKorMK) {
784 // Untie Src2 of fmac.
785 MI->untieRegOperand(3);
786 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
787 if (OpNo == 1) {
788 MachineOperand &Op1 = MI->getOperand(1);
789 MachineOperand &Op2 = MI->getOperand(2);
790 Register OldReg = Op1.getReg();
791 // Operand 2 might be an inlinable constant
792 if (Op2.isImm()) {
793 Op1.ChangeToImmediate(Op2.getImm());
794 Op2.ChangeToRegister(OldReg, false);
795 } else {
796 Op1.setReg(Op2.getReg());
797 Op2.setReg(OldReg);
798 }
799 }
800 return true;
801 }
802 MI->setDesc(TII->get(Opc));
803 return false;
804 };
805
806 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
807 if (!IsLegal && OpToFold.isImm()) {
808 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
809 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
810 }
811
812 if (!IsLegal) {
813 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
814 unsigned NewOpc = macToMad(Opc);
815 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
816 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
817 // to fold the operand.
818 MI->setDesc(TII->get(NewOpc));
819 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
820 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
821 if (AddOpSel)
822 MI->addOperand(MachineOperand::CreateImm(0));
823 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
824 if (FoldAsMAD) {
825 MI->untieRegOperand(OpNo);
826 return true;
827 }
828 if (AddOpSel)
829 MI->removeOperand(MI->getNumExplicitOperands() - 1);
830 MI->setDesc(TII->get(Opc));
831 }
832
833 // Special case for s_fmac_f32 if we are trying to fold into Src2.
834 // By transforming into fmaak we can untie Src2 and make folding legal.
835 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
836 if (tryToFoldAsFMAAKorMK())
837 return true;
838 }
839
840 // Special case for s_setreg_b32
841 if (OpToFold.isImm()) {
842 unsigned ImmOpc = 0;
843 if (Opc == AMDGPU::S_SETREG_B32)
844 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
845 else if (Opc == AMDGPU::S_SETREG_B32_mode)
846 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
847 if (ImmOpc) {
848 MI->setDesc(TII->get(ImmOpc));
849 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
850 return true;
851 }
852 }
853
854 // Operand is not legal, so try to commute the instruction to
855 // see if this makes it possible to fold.
856 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
857 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
858 if (!CanCommute)
859 return false;
860
861 MachineOperand &Op = MI->getOperand(OpNo);
862 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
863
864 // One of operands might be an Imm operand, and OpNo may refer to it after
865 // the call of commuteInstruction() below. Such situations are avoided
866 // here explicitly as OpNo must be a register operand to be a candidate
867 // for memory folding.
868 if (!Op.isReg() || !CommutedOp.isReg())
869 return false;
870
871 // The same situation with an immediate could reproduce if both inputs are
872 // the same register.
873 if (Op.isReg() && CommutedOp.isReg() &&
874 (Op.getReg() == CommutedOp.getReg() &&
875 Op.getSubReg() == CommutedOp.getSubReg()))
876 return false;
877
878 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
879 return false;
880
881 int Op32 = -1;
882 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
883 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
884 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
885 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
886 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
887 return false;
888 }
889
890 // Verify the other operand is a VGPR, otherwise we would violate the
891 // constant bus restriction.
892 MachineOperand &OtherOp = MI->getOperand(OpNo);
893 if (!OtherOp.isReg() ||
894 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
895 return false;
896
897 assert(MI->getOperand(1).isDef());
898
899 // Make sure to get the 32-bit version of the commuted opcode.
900 unsigned MaybeCommutedOpc = MI->getOpcode();
901 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
902 }
903
904 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
905 Op32);
906 return true;
907 }
908
909 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
910 // By changing into fmamk we can untie Src2.
911 // If folding for Src0 happens first and it is identical operand to Src1 we
912 // should avoid transforming into fmamk which requires commuting as it would
913 // cause folding into Src1 to fail later on due to wrong OpNo used.
914 if (Opc == AMDGPU::S_FMAC_F32 &&
915 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
916 if (tryToFoldAsFMAAKorMK())
917 return true;
918 }
919
920 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
921 return true;
922}
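// Rough sketch of the commute path above (not an exhaustive description): an
// immediate that is illegal in its current source position is retried on the
// commuted operand; if it is still illegal and the instruction is one of the
// V_ADD/SUB/SUBREV_CO_U32_e64 forms, the candidate is recorded together with
// the corresponding 32-bit opcode so the literal can be applied by shrinking
// to the VOP2 encoding, which requires the remaining source to be a VGPR and,
// when the fold is applied, VCC to be dead.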
923
924bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
925 const MachineOperand &UseMO) const {
926 // Operands of SDWA instructions must be registers.
927 return !TII->isSDWA(MI);
928}
929
930static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
931 const MachineRegisterInfo &MRI,
932 Register SrcReg) {
933 MachineOperand *Sub = nullptr;
934 for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
935 SubDef && TII.isFoldableCopy(*SubDef);
936 SubDef = MRI.getVRegDef(Sub->getReg())) {
937 MachineOperand &SrcOp = SubDef->getOperand(1);
938 if (SrcOp.isImm())
939 return &SrcOp;
940 if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
941 break;
942 Sub = &SrcOp;
943 // TODO: Support compose
944 if (SrcOp.getSubReg())
945 break;
946 }
947
948 return Sub;
949}
950
951const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
952 MachineInstr &RegSeq,
953 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
954
955 assert(RegSeq.isRegSequence());
956
957 const TargetRegisterClass *RC = nullptr;
958
959 for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
960 MachineOperand &SrcOp = RegSeq.getOperand(I);
961 unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
962
963 // Only accept reg_sequence with uniform reg class inputs for simplicity.
964 const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
965 if (!RC)
966 RC = OpRC;
967 else if (!TRI->getCommonSubClass(RC, OpRC))
968 return nullptr;
969
970 if (SrcOp.getSubReg()) {
971 // TODO: Handle subregister compose
972 Defs.emplace_back(&SrcOp, SubRegIdx);
973 continue;
974 }
975
976 MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
977 if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
978 Defs.emplace_back(DefSrc, SubRegIdx);
979 continue;
980 }
981
982 Defs.emplace_back(&SrcOp, SubRegIdx);
983 }
984
985 return RC;
986}
987
988// Find a def of the UseReg, check if it is a reg_sequence and find initializers
989// for each subreg, tracking it to an immediate if possible. Returns the
990// register class of the inputs on success.
991const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
992 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
993 Register UseReg) const {
994 MachineInstr *Def = MRI->getVRegDef(UseReg);
995 if (!Def || !Def->isRegSequence())
996 return nullptr;
997
998 return getRegSeqInit(*Def, Defs);
999}
1000
1001std::pair<int64_t, const TargetRegisterClass *>
1002SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
1003 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
1004 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
1005 if (!SrcRC)
1006 return {};
1007
1008 bool TryToMatchSplat64 = false;
1009
1010 int64_t Imm;
1011 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
1012 const MachineOperand *Op = Defs[I].first;
1013 if (!Op->isImm())
1014 return {};
1015
1016 int64_t SubImm = Op->getImm();
1017 if (!I) {
1018 Imm = SubImm;
1019 continue;
1020 }
1021
1022 if (Imm != SubImm) {
1023 if (I == 1 && (E & 1) == 0) {
1024 // If we have an even number of inputs, there's a chance this is a
1025 // 64-bit element splat broken into 32-bit pieces.
1026 TryToMatchSplat64 = true;
1027 break;
1028 }
1029
1030 return {}; // Can only fold splat constants
1031 }
1032 }
1033
1034 if (!TryToMatchSplat64)
1035 return {Defs[0].first->getImm(), SrcRC};
1036
1037 // Fallback to recognizing 64-bit splats broken into 32-bit pieces
1038 // (i.e. recognize that every other element is 0 for 64-bit immediates)
1039 int64_t SplatVal64;
1040 for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
1041 const MachineOperand *Op0 = Defs[I].first;
1042 const MachineOperand *Op1 = Defs[I + 1].first;
1043
1044 if (!Op0->isImm() || !Op1->isImm())
1045 return {};
1046
1047 unsigned SubReg0 = Defs[I].second;
1048 unsigned SubReg1 = Defs[I + 1].second;
1049
1050 // Assume we're going to generally encounter reg_sequences with sorted
1051 // subreg indexes, so reject any that aren't consecutive.
1052 if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
1053 TRI->getChannelFromSubReg(SubReg1))
1054 return {};
1055
1056 int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
1057 if (I == 0)
1058 SplatVal64 = MergedVal;
1059 else if (SplatVal64 != MergedVal)
1060 return {};
1061 }
1062
1063 const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
1064 MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);
1065
1066 return {SplatVal64, RC64};
1067}
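// Hand-worked illustration of the 64-bit splat fallback above (values
// invented): a reg_sequence whose 32-bit inputs are (0x00000000, sub0),
// (0x3FF00000, sub1), (0x00000000, sub2), (0x3FF00000, sub3) is reported as a
// splat of 0x3FF0000000000000 (the double 1.0), with the returned class
// narrowed to the sub0_sub1 subclass of the reg_sequence result.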
1068
1069bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
1070 MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
1071 const TargetRegisterClass *SplatRC) const {
1072 const MCInstrDesc &Desc = UseMI->getDesc();
1073 if (UseOpIdx >= Desc.getNumOperands())
1074 return false;
1075
1076 // Filter out unhandled pseudos.
1077 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1078 return false;
1079
1080 int16_t RCID = Desc.operands()[UseOpIdx].RegClass;
1081 if (RCID == -1)
1082 return false;
1083
1084 const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
1085
1086 // Special case 0/-1, since when interpreted as a 64-bit element both halves
1087 // have the same bits. These are the only cases where a splat has the same
1088 // interpretation for 32-bit and 64-bit splats.
1089 if (SplatVal != 0 && SplatVal != -1) {
1090 // We need to figure out the scalar type read by the operand. e.g. the MFMA
1091 // operand will be AReg_128, and we want to check if it's compatible with an
1092 // AReg_32 constant.
1093 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
1094 switch (OpTy) {
1095 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
1096 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
1097 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
1098 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
1099 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
1100 break;
1101 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
1102 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
1103 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
1104 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
1105 break;
1106 default:
1107 return false;
1108 }
1109
1110 if (!TRI->getCommonSubClass(OpRC, SplatRC))
1111 return false;
1112 }
1113
1114 MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
1115 if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
1116 return false;
1117
1118 return true;
1119}
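// For illustration (sketch): a splat of 0 or -1 can fold into, say, an
// AReg_128 MFMA source without the class check because a 64-bit lane built
// from two such halves has the same interpretation; a 32-bit splat such as
// 0x3F800000 (1.0f) only folds when the operand actually reads 32-bit lanes,
// i.e. when its per-lane class shares a subclass with SplatRC.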
1120
1121bool SIFoldOperandsImpl::tryToFoldACImm(
1122 const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
1123 SmallVectorImpl<FoldCandidate> &FoldList) const {
1124 const MCInstrDesc &Desc = UseMI->getDesc();
1125 if (UseOpIdx >= Desc.getNumOperands())
1126 return false;
1127
1128 // Filter out unhandled pseudos.
1129 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1130 return false;
1131
1132 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
1133 if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
1134 appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
1135 return true;
1136 }
1137
1138 // TODO: Verify the following code handles subregisters correctly.
1139 // TODO: Handle extract of global reference
1140 if (UseOp.getSubReg())
1141 return false;
1142
1143 if (!OpToFold.isReg())
1144 return false;
1145
1146 Register UseReg = OpToFold.getReg();
1147 if (!UseReg.isVirtual())
1148 return false;
1149
1150 // Maybe it is just a COPY of an immediate itself.
1151
1152 // FIXME: Remove this handling. There is already special case folding of
1153 // immediate into copy in foldOperand. This is looking for the def of the
1154 // value the folding started from in the first place.
1155 MachineInstr *Def = MRI->getVRegDef(UseReg);
1156 if (Def && TII->isFoldableCopy(*Def)) {
1157 MachineOperand &DefOp = Def->getOperand(1);
1158 if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
1159 FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
1160 OpToFold.DefSubReg);
1161 appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
1162 return true;
1163 }
1164 }
1165
1166 return false;
1167}
1168
1169void SIFoldOperandsImpl::foldOperand(
1170 FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
1171 SmallVectorImpl<FoldCandidate> &FoldList,
1172 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
1173 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
1174
1175 if (!isUseSafeToFold(*UseMI, *UseOp))
1176 return;
1177
1178 // FIXME: Fold operands with subregs.
1179 if (UseOp->isReg() && OpToFold.isReg()) {
1180 if (UseOp->isImplicit())
1181 return;
1182 // Allow folding from SGPRs to 16-bit VGPRs.
1183 if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
1184 (UseOp->getSubReg() != AMDGPU::lo16 ||
1185 !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
1186 return;
1187 }
1188
1189 // Special case for REG_SEQUENCE: We can't fold literals into
1190 // REG_SEQUENCE instructions, so we have to fold them into the
1191 // uses of REG_SEQUENCE.
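// For example (illustrative MIR, names invented):
//   %k = S_MOV_B32 1065353216
//   %rs:vreg_64 = REG_SEQUENCE %k, %subreg.sub0, %k, %subreg.sub1
// the literal behind %k is folded into the users of %rs (an MFMA source, a
// VALU operand, ...) rather than into the REG_SEQUENCE itself.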
1192 if (UseMI->isRegSequence()) {
1193 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
1194 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
1195
1196 int64_t SplatVal;
1197 const TargetRegisterClass *SplatRC;
1198 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
1199
1200 // Grab the use operands first
1201 SmallVector<MachineOperand *, 4> UsesToProcess(
1202 llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
1203 for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
1204 MachineOperand *RSUse = UsesToProcess[I];
1205 MachineInstr *RSUseMI = RSUse->getParent();
1206 unsigned OpNo = RSUseMI->getOperandNo(RSUse);
1207
1208 if (SplatRC) {
1209 if (RSUseMI->isCopy()) {
1210 Register DstReg = RSUseMI->getOperand(0).getReg();
1211 append_range(UsesToProcess,
1212 make_pointer_range(MRI->use_nodbg_operands(DstReg)));
1213 continue;
1214 }
1215 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
1216 FoldableDef SplatDef(SplatVal, SplatRC);
1217 appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
1218 continue;
1219 }
1220 }
1221
1222 // TODO: Handle general compose
1223 if (RSUse->getSubReg() != RegSeqDstSubReg)
1224 continue;
1225
1226 // FIXME: We should avoid recursing here. There should be a cleaner split
1227 // between the in-place mutations and adding to the fold list.
1228 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
1229 CopiesToReplace);
1230 }
1231
1232 return;
1233 }
1234
1235 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
1236 return;
1237
1238 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
1239 // Verify that this is a stack access.
1240 // FIXME: Should probably use stack pseudos before frame lowering.
1241
1242 if (TII->isMUBUF(*UseMI)) {
1243 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
1244 MFI->getScratchRSrcReg())
1245 return;
1246
1247 // Ensure this is either relative to the current frame or the current
1248 // wave.
1249 MachineOperand &SOff =
1250 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
1251 if (!SOff.isImm() || SOff.getImm() != 0)
1252 return;
1253 }
1254
1255 const unsigned Opc = UseMI->getOpcode();
1256 if (TII->isFLATScratch(*UseMI) &&
1257 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
1258 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
1259 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
1260 unsigned CPol =
1261 TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
1262 if ((CPol & AMDGPU::CPol::SCAL) &&
1264 return;
1265
1266 UseMI->setDesc(TII->get(NewOpc));
1267 }
1268
1269 // A frame index will resolve to a positive constant, so it should always be
1270 // safe to fold the addressing mode, even pre-GFX9.
1271 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
1272
1273 return;
1274 }
1275
1276 bool FoldingImmLike =
1277 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1278
1279 if (FoldingImmLike && UseMI->isCopy()) {
1280 Register DestReg = UseMI->getOperand(0).getReg();
1281 Register SrcReg = UseMI->getOperand(1).getReg();
1282 unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
1283 assert(SrcReg.isVirtual());
1284
1285 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
1286
1287 // Don't fold into a copy to a physical register with the same class. Doing
1288 // so would interfere with the register coalescer's logic which would avoid
1289 // redundant initializations.
1290 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
1291 return;
1292
1293 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1294 // In order to fold immediates into copies, we need to change the copy to a
1295 // MOV. Find a compatible mov instruction with the value.
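// For example (illustrative): a COPY producing a 32-bit VGPR from a
// materialized immediate is rewritten to V_MOV_B32_e32, and one producing a
// 64-bit SGPR to S_MOV_B64, so the literal can then be folded directly into
// the move's source operand.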
1296 for (unsigned MovOp :
1297 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1298 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
1299 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
1300 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
1301 const MCInstrDesc &MovDesc = TII->get(MovOp);
1302 assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);
1303
1304 const TargetRegisterClass *MovDstRC =
1305 TRI->getRegClass(MovDesc.operands()[0].RegClass);
1306
1307 // Fold if the destination register class of the MOV instruction (ResRC)
1308 // is a superclass of (or equal to) the destination register class of the
1309 // COPY (DestRC). If this condition fails, folding would be illegal.
1310 if (!DestRC->hasSuperClassEq(MovDstRC))
1311 continue;
1312
1313 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1314 const TargetRegisterClass *MovSrcRC =
1315 TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass);
1316 if (MovSrcRC) {
1317 if (UseSubReg)
1318 MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1319 if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
1320 break;
1321
1322 // FIXME: This is mutating the instruction only and deferring the actual
1323 // fold of the immediate
1324 } else {
1325 // For the _IMM_PSEUDO cases, there can be value restrictions on the
1326 // immediate to verify. Technically we should always verify this, but it
1327 // only matters for these concrete cases.
1328 // TODO: Handle non-imm case if it's useful.
1329 if (!OpToFold.isImm() ||
1330 !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
1331 break;
1332 }
1333
1334 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
1335 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
1336 while (ImpOpI != ImpOpE) {
1337 MachineInstr::mop_iterator Tmp = ImpOpI;
1338 ImpOpI++;
1339 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
1340 }
1341 UseMI->setDesc(MovDesc);
1342
1343 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
1344 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
1345 MachineOperand NewSrcOp(SrcOp);
1346 MachineFunction *MF = UseMI->getParent()->getParent();
1347 UseMI->removeOperand(1);
1348 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1349 UseMI->addOperand(NewSrcOp); // src0
1350 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1351 UseOpIdx = SrcIdx;
1352 UseOp = &UseMI->getOperand(UseOpIdx);
1353 }
1354 CopiesToReplace.push_back(UseMI);
1355 break;
1356 }
1357
1358 // We failed to replace the copy, so give up.
1359 if (UseMI->getOpcode() == AMDGPU::COPY)
1360 return;
1361
1362 } else {
1363 if (UseMI->isCopy() && OpToFold.isReg() &&
1364 UseMI->getOperand(0).getReg().isVirtual() &&
1365 !UseMI->getOperand(1).getSubReg() &&
1366 OpToFold.DefMI->implicit_operands().empty()) {
1367 LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
1368 << *UseMI);
1369 unsigned Size = TII->getOpSize(*UseMI, 1);
1370 Register UseReg = OpToFold.getReg();
1371 UseMI->getOperand(1).setReg(UseReg);
1372 unsigned SubRegIdx = OpToFold.getSubReg();
1373 // Hack to allow 32-bit SGPRs to be folded into True16 instructions
1374 // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1375 // VS_16RegClass
1376 //
1377 // Excerpt from AMDGPUGenRegisterInfo.inc
1378 // NoSubRegister, //0
1379 // hi16, // 1
1380 // lo16, // 2
1381 // sub0, // 3
1382 // ...
1383 // sub1, // 11
1384 // sub1_hi16, // 12
1385 // sub1_lo16, // 13
1386 static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1387 if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1388 TRI->isSGPRReg(*MRI, UseReg)) {
1389 // Produce the 32 bit subregister index to which the 16-bit subregister
1390 // is aligned.
1391 if (SubRegIdx > AMDGPU::sub1) {
1392 LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1393 M |= M.getLane(M.getHighestLane() - 1);
1394 SmallVector<unsigned, 4> Indexes;
1395 TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1396 Indexes);
1397 assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1398 SubRegIdx = Indexes[0];
1399 // 32-bit registers do not have a sub0 index
1400 } else if (TII->getOpSize(*UseMI, 1) == 4)
1401 SubRegIdx = 0;
1402 else
1403 SubRegIdx = AMDGPU::sub0;
1404 }
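// Illustrative sketch of the remapping above: an SGPR source used with the
// 16-bit subregister sub1_lo16 ends up using the covering 32-bit index sub1
// before the copy is queued for folding.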
1405 UseMI->getOperand(1).setSubReg(SubRegIdx);
1406 UseMI->getOperand(1).setIsKill(false);
1407 CopiesToReplace.push_back(UseMI);
1408 OpToFold.OpToFold->setIsKill(false);
1409
1410 // Remove kill flags as kills may now be out of order with uses.
1411 MRI->clearKillFlags(UseReg);
1412 if (foldCopyToAGPRRegSequence(UseMI))
1413 return;
1414 }
1415
1416 unsigned UseOpc = UseMI->getOpcode();
1417 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1418 (UseOpc == AMDGPU::V_READLANE_B32 &&
1419 (int)UseOpIdx ==
1420 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1421 // %vgpr = V_MOV_B32 imm
1422 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1423 // =>
1424 // %sgpr = S_MOV_B32 imm
1425 if (FoldingImmLike) {
1426 if (execMayBeModifiedBeforeUse(*MRI,
1427 UseMI->getOperand(UseOpIdx).getReg(),
1428 *OpToFold.DefMI, *UseMI))
1429 return;
1430
1431 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1432
1433 if (OpToFold.isImm()) {
1434 UseMI->getOperand(1).ChangeToImmediate(
1435 *OpToFold.getEffectiveImmVal());
1436 } else if (OpToFold.isFI())
1437 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
1438 else {
1439 assert(OpToFold.isGlobal());
1440 UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
1441 OpToFold.OpToFold->getOffset(),
1442 OpToFold.OpToFold->getTargetFlags());
1443 }
1444 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1445 return;
1446 }
1447
1448 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1449 if (execMayBeModifiedBeforeUse(*MRI,
1450 UseMI->getOperand(UseOpIdx).getReg(),
1451 *OpToFold.DefMI, *UseMI))
1452 return;
1453
1454 // %vgpr = COPY %sgpr0
1455 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1456 // =>
1457 // %sgpr1 = COPY %sgpr0
1458 UseMI->setDesc(TII->get(AMDGPU::COPY));
1459 UseMI->getOperand(1).setReg(OpToFold.getReg());
1460 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1461 UseMI->getOperand(1).setIsKill(false);
1462 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1463 return;
1464 }
1465 }
1466
1467 const MCInstrDesc &UseDesc = UseMI->getDesc();
1468
1469 // Don't fold into target independent nodes. Target independent opcodes
1470 // don't have defined register classes.
1471 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1472 UseDesc.operands()[UseOpIdx].RegClass == -1)
1473 return;
1474 }
1475
1476 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1477 // to enable more folding opportunities. The shrink operands pass
1478 // already does this.
1479
1480 tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
1481}
1482
1483static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1484 uint32_t LHS, uint32_t RHS) {
1485 switch (Opcode) {
1486 case AMDGPU::V_AND_B32_e64:
1487 case AMDGPU::V_AND_B32_e32:
1488 case AMDGPU::S_AND_B32:
1489 Result = LHS & RHS;
1490 return true;
1491 case AMDGPU::V_OR_B32_e64:
1492 case AMDGPU::V_OR_B32_e32:
1493 case AMDGPU::S_OR_B32:
1494 Result = LHS | RHS;
1495 return true;
1496 case AMDGPU::V_XOR_B32_e64:
1497 case AMDGPU::V_XOR_B32_e32:
1498 case AMDGPU::S_XOR_B32:
1499 Result = LHS ^ RHS;
1500 return true;
1501 case AMDGPU::S_XNOR_B32:
1502 Result = ~(LHS ^ RHS);
1503 return true;
1504 case AMDGPU::S_NAND_B32:
1505 Result = ~(LHS & RHS);
1506 return true;
1507 case AMDGPU::S_NOR_B32:
1508 Result = ~(LHS | RHS);
1509 return true;
1510 case AMDGPU::S_ANDN2_B32:
1511 Result = LHS & ~RHS;
1512 return true;
1513 case AMDGPU::S_ORN2_B32:
1514 Result = LHS | ~RHS;
1515 return true;
1516 case AMDGPU::V_LSHL_B32_e64:
1517 case AMDGPU::V_LSHL_B32_e32:
1518 case AMDGPU::S_LSHL_B32:
1519 // The instruction ignores the high bits for out of bounds shifts.
1520 Result = LHS << (RHS & 31);
1521 return true;
1522 case AMDGPU::V_LSHLREV_B32_e64:
1523 case AMDGPU::V_LSHLREV_B32_e32:
1524 Result = RHS << (LHS & 31);
1525 return true;
1526 case AMDGPU::V_LSHR_B32_e64:
1527 case AMDGPU::V_LSHR_B32_e32:
1528 case AMDGPU::S_LSHR_B32:
1529 Result = LHS >> (RHS & 31);
1530 return true;
1531 case AMDGPU::V_LSHRREV_B32_e64:
1532 case AMDGPU::V_LSHRREV_B32_e32:
1533 Result = RHS >> (LHS & 31);
1534 return true;
1535 case AMDGPU::V_ASHR_I32_e64:
1536 case AMDGPU::V_ASHR_I32_e32:
1537 case AMDGPU::S_ASHR_I32:
1538 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1539 return true;
1540 case AMDGPU::V_ASHRREV_I32_e64:
1541 case AMDGPU::V_ASHRREV_I32_e32:
1542 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1543 return true;
1544 default:
1545 return false;
1546 }
1547}
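// Hand-worked example of the shift cases above: constant-folding
// V_LSHLREV_B32 with a shift amount of 35 shifts by 35 & 31 = 3, matching the
// hardware's treatment of out-of-range shift amounts.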
1548
1549static unsigned getMovOpc(bool IsScalar) {
1550 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1551}
1552
1553static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1554 MI.setDesc(NewDesc);
1555
1556 // Remove any leftover implicit operands from mutating the instruction. e.g.
1557 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1558 // anymore.
1559 const MCInstrDesc &Desc = MI.getDesc();
1560 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1561 Desc.implicit_defs().size();
1562
1563 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1564 MI.removeOperand(I);
1565}
1566
1567std::optional<int64_t>
1568SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
1569 if (Op.isImm())
1570 return Op.getImm();
1571
1572 if (!Op.isReg() || !Op.getReg().isVirtual())
1573 return std::nullopt;
1574
1575 const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1576 if (Def && Def->isMoveImmediate()) {
1577 const MachineOperand &ImmSrc = Def->getOperand(1);
1578 if (ImmSrc.isImm())
1579 return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1580 }
1581
1582 return std::nullopt;
1583}
1584
1585// Try to simplify operations with a constant that may appear after instruction
1586// selection.
1587// TODO: See if a frame index with a fixed offset can fold.
1588bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1589 if (!MI->allImplicitDefsAreDead())
1590 return false;
1591
1592 unsigned Opc = MI->getOpcode();
1593
1594 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1595 if (Src0Idx == -1)
1596 return false;
1597
1598 MachineOperand *Src0 = &MI->getOperand(Src0Idx);
1599 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1600
1601 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1602 Opc == AMDGPU::S_NOT_B32) &&
1603 Src0Imm) {
1604 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
1605 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1606 return true;
1607 }
1608
1609 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1610 if (Src1Idx == -1)
1611 return false;
1612
1613 MachineOperand *Src1 = &MI->getOperand(Src1Idx);
1614 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1615
1616 if (!Src0Imm && !Src1Imm)
1617 return false;
1618
1619 // and k0, k1 -> v_mov_b32 (k0 & k1)
1620 // or k0, k1 -> v_mov_b32 (k0 | k1)
1621 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1622 if (Src0Imm && Src1Imm) {
1623 int32_t NewImm;
1624 if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
1625 return false;
1626
1627 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1628
1629 // Be careful to change the right operand, src0 may belong to a different
1630 // instruction.
1631 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1632 MI->removeOperand(Src1Idx);
1633 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1634 return true;
1635 }
1636
1637 if (!MI->isCommutable())
1638 return false;
1639
1640 if (Src0Imm && !Src1Imm) {
1641 std::swap(Src0, Src1);
1642 std::swap(Src0Idx, Src1Idx);
1643 std::swap(Src0Imm, Src1Imm);
1644 }
1645
1646 int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
1647 if (Opc == AMDGPU::V_OR_B32_e64 ||
1648 Opc == AMDGPU::V_OR_B32_e32 ||
1649 Opc == AMDGPU::S_OR_B32) {
1650 if (Src1Val == 0) {
1651 // y = or x, 0 => y = copy x
1652 MI->removeOperand(Src1Idx);
1653 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1654 } else if (Src1Val == -1) {
1655 // y = or x, -1 => y = v_mov_b32 -1
1656 MI->removeOperand(Src1Idx);
1657 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1658 } else
1659 return false;
1660
1661 return true;
1662 }
1663
1664 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1665 Opc == AMDGPU::S_AND_B32) {
1666 if (Src1Val == 0) {
1667 // y = and x, 0 => y = v_mov_b32 0
1668 MI->removeOperand(Src0Idx);
1669 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1670 } else if (Src1Val == -1) {
1671 // y = and x, -1 => y = copy x
1672 MI->removeOperand(Src1Idx);
1673 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1674 } else
1675 return false;
1676
1677 return true;
1678 }
1679
1680 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1681 Opc == AMDGPU::S_XOR_B32) {
1682 if (Src1Val == 0) {
1683 // y = xor x, 0 => y = copy x
1684 MI->removeOperand(Src1Idx);
1685 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1686 return true;
1687 }
1688 }
1689
1690 return false;
1691}
1692
1693// Try to fold an instruction into a simpler one
1694bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1695 unsigned Opc = MI.getOpcode();
1696 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1697 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1698 return false;
1699
1700 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1701 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1702 if (!Src1->isIdenticalTo(*Src0)) {
1703 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1704 if (!Src1Imm)
1705 return false;
1706
1707 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1708 if (!Src0Imm || *Src0Imm != *Src1Imm)
1709 return false;
1710 }
1711
1712 int Src1ModIdx =
1713 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1714 int Src0ModIdx =
1715 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1716 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1717 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1718 return false;
1719
1720 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1721 auto &NewDesc =
1722 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1723 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1724 if (Src2Idx != -1)
1725 MI.removeOperand(Src2Idx);
1726 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1727 if (Src1ModIdx != -1)
1728 MI.removeOperand(Src1ModIdx);
1729 if (Src0ModIdx != -1)
1730 MI.removeOperand(Src0ModIdx);
1731 mutateCopyOp(MI, NewDesc);
1732 LLVM_DEBUG(dbgs() << MI);
1733 return true;
1734}
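// [Editorial example, not part of the upstream source] tryFoldCndMask above
// collapses a select whose two value operands are provably equal, roughly:
//   %d = V_CNDMASK_B32_e64 0, %x, 0, %x, %cc  ->  %d = COPY %x
//   %d = V_CNDMASK_B32_e64 0, 5, 0, 5, %cc    ->  %d = V_MOV_B32_e32 5
// The condition operand (src2) and the all-zero src_modifiers are dropped.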
1735
1736bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1737 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1738 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1739 return false;
1740
1741 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
1742 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1743 return false;
1744
1745 Register Src1 = MI.getOperand(2).getReg();
1746 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1747 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1748 return false;
1749
1750 Register Dst = MI.getOperand(0).getReg();
1751 MRI->replaceRegWith(Dst, Src1);
1752 if (!MI.getOperand(2).isKill())
1753 MRI->clearKillFlags(Src1);
1754 MI.eraseFromParent();
1755 return true;
1756}
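// [Editorial example, not part of the upstream source] Sketch of the
// tryFoldZeroHighBits rewrite: when %y is produced by an opcode that the
// subtarget reports as zeroing bits [31:16] of its 32-bit destination
// (GCNSubtarget::zeroesHigh16BitsOfDest), the mask is redundant:
//   %y = <16-bit VALU op> ...
//   %d = V_AND_B32_e32 0xffff, %y   ->  erased; all uses of %d rewritten to %y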
1757
1758bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1759 const FoldableDef &OpToFold) const {
1760 // We need to mutate the operands of new mov instructions to add implicit
1761 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1762 // this.
1763 SmallVector<MachineInstr *, 4> CopiesToReplace;
1764 SmallVector<FoldCandidate, 4> FoldList;
1765 MachineOperand &Dst = MI.getOperand(0);
1766 bool Changed = false;
1767
1768 if (OpToFold.isImm()) {
1769 for (auto &UseMI :
1770 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1771 // Folding the immediate may reveal operations that can be constant
1772 // folded or replaced with a copy. This can happen for example after
1773 // frame indices are lowered to constants or from splitting 64-bit
1774 // constants.
1775 //
1776 // We may also encounter cases where one or both operands are
1777 // immediates materialized into a register, which would ordinarily not
1778 // be folded due to multiple uses or operand constraints.
1779 if (tryConstantFoldOp(&UseMI)) {
1780 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1781 Changed = true;
1782 }
1783 }
1784 }
1785
1786 SmallVector<MachineOperand *, 4> UsesToProcess(
1787 llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
1788 for (auto *U : UsesToProcess) {
1789 MachineInstr *UseMI = U->getParent();
1790
1791 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
1792 foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1793 CopiesToReplace);
1794 }
1795
1796 if (CopiesToReplace.empty() && FoldList.empty())
1797 return Changed;
1798
1799 MachineFunction *MF = MI.getParent()->getParent();
1800 // Make sure we add EXEC uses to any new v_mov instructions created.
1801 for (MachineInstr *Copy : CopiesToReplace)
1802 Copy->addImplicitDefUseOperands(*MF);
1803
1804 SetVector<MachineInstr *> ConstantFoldCandidates;
1805 for (FoldCandidate &Fold : FoldList) {
1806 assert(!Fold.isReg() || Fold.Def.OpToFold);
1807 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1808 Register Reg = Fold.getReg();
1809 const MachineInstr *DefMI = Fold.Def.DefMI;
1810 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1811 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1812 continue;
1813 }
1814 if (updateOperand(Fold)) {
1815 // Clear kill flags.
1816 if (Fold.isReg()) {
1817 assert(Fold.Def.OpToFold && Fold.isReg());
1818 // FIXME: Probably shouldn't bother trying to fold if not an
1819 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1820 // copies.
1821 MRI->clearKillFlags(Fold.getReg());
1822 }
1823 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1824 << static_cast<int>(Fold.UseOpNo) << " of "
1825 << *Fold.UseMI);
1826
1827 if (Fold.isImm())
1828 ConstantFoldCandidates.insert(Fold.UseMI);
1829
1830 } else if (Fold.Commuted) {
1831 // Restore the instruction's original operand order if the fold has failed.
1832 TII->commuteInstruction(*Fold.UseMI, false);
1833 }
1834 }
1835
1836 for (MachineInstr *MI : ConstantFoldCandidates) {
1837 if (tryConstantFoldOp(MI)) {
1838 LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
1839 Changed = true;
1840 }
1841 }
1842 return true;
1843}
1844
1845/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
1846/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
1847bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
1848 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
1849 // only accept VGPR or inline immediate. Recreate a reg_sequence with its
1850 // initializers right here, so we will rematerialize immediates and avoid
1851 // copies via different reg classes.
1852 const TargetRegisterClass *DefRC =
1853 MRI->getRegClass(CopyMI->getOperand(0).getReg());
1854 if (!TRI->isAGPRClass(DefRC))
1855 return false;
1856
1857 Register UseReg = CopyMI->getOperand(1).getReg();
1858 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
1859 if (!RegSeq || !RegSeq->isRegSequence())
1860 return false;
1861
1862 const DebugLoc &DL = CopyMI->getDebugLoc();
1863 MachineBasicBlock &MBB = *CopyMI->getParent();
1864
1865 MachineInstrBuilder B(*MBB.getParent(), CopyMI);
1866 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1867
1868 const TargetRegisterClass *UseRC =
1869 MRI->getRegClass(CopyMI->getOperand(1).getReg());
1870
1871 // Value, subregindex for new REG_SEQUENCE
1872 SmallVector<std::pair<MachineOperand *, unsigned>, 32> NewDefs;
1873
1874 unsigned NumRegSeqOperands = RegSeq->getNumOperands();
1875 unsigned NumFoldable = 0;
1876
1877 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
1878 MachineOperand &RegOp = RegSeq->getOperand(I);
1879 unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
1880
1881 if (RegOp.getSubReg()) {
1882 // TODO: Handle subregister compose
1883 NewDefs.emplace_back(&RegOp, SubRegIdx);
1884 continue;
1885 }
1886
1887 MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
1888 if (!Lookup)
1889 Lookup = &RegOp;
1890
1891 if (Lookup->isImm()) {
1892 // Check if this is an agpr_32 subregister.
1893 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
1894 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1895 if (DestSuperRC &&
1896 TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1897 ++NumFoldable;
1898 NewDefs.emplace_back(Lookup, SubRegIdx);
1899 continue;
1900 }
1901 }
1902
1903 const TargetRegisterClass *InputRC =
1904 Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
1905 : MRI->getRegClass(RegOp.getReg());
1906
1907 // TODO: Account for Lookup->getSubReg()
1908
1909 // If we can't find a matching super class, this is an SGPR->AGPR or
1910 // VGPR->AGPR subreg copy (or something constant-like we have to materialize
1911 // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
1912 // want to rewrite to copy to an intermediate VGPR class.
1913 const TargetRegisterClass *MatchRC =
1914 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1915 if (!MatchRC) {
1916 ++NumFoldable;
1917 NewDefs.emplace_back(&RegOp, SubRegIdx);
1918 continue;
1919 }
1920
1921 NewDefs.emplace_back(&RegOp, SubRegIdx);
1922 }
1923
1924 // Do not clone a reg_sequence and merely change the result register class.
1925 if (NumFoldable == 0)
1926 return false;
1927
1928 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1929 for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
1930 CopyMI->removeOperand(I);
1931
1932 for (auto [Def, DestSubIdx] : NewDefs) {
1933 if (!Def->isReg()) {
1934 // TODO: Should we use single write for each repeated value like in
1935 // register case?
1936 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1937 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1938 .add(*Def);
1939 B.addReg(Tmp);
1940 } else {
1941 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
1942 Def->setIsKill(false);
1943
1944 Register &VGPRCopy = VGPRCopies[Src];
1945 if (!VGPRCopy) {
1946 const TargetRegisterClass *VGPRUseSubRC =
1947 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1948
1949 // We cannot build a reg_sequence out of the same registers; they
1950 // must be copied. Better to do it here before copyPhysReg() creates
1951 // several reads to do the AGPR->VGPR->AGPR copy.
1952
1953 // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
1954 // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
1955 // later, create a copy here and track if we already have such a copy.
1956 const TargetRegisterClass *SubRC =
1957 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
1958 if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
1959 // TODO: Try to reconstrain class
1960 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
1961 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
1962 B.addReg(VGPRCopy);
1963 } else {
1964 // If it is already a VGPR, do not copy the register.
1965 B.add(*Def);
1966 }
1967 } else {
1968 B.addReg(VGPRCopy);
1969 }
1970 }
1971
1972 B.addImm(DestSubIdx);
1973 }
1974
1975 LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
1976 return true;
1977}
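// [Editorial example, not part of the upstream source] Rough shape of the
// foldCopyToAGPRRegSequence rewrite when the initializers are inline
// constants materialized by V_MOV_B32:
//   %v:vreg_64  = REG_SEQUENCE %k0:vgpr_32, sub0, %k1:vgpr_32, sub1
//   %a:areg_64  = COPY %v
// becomes
//   %t0:agpr_32 = V_ACCVGPR_WRITE_B32_e64 <imm0>
//   %t1:agpr_32 = V_ACCVGPR_WRITE_B32_e64 <imm1>
//   %a:areg_64  = REG_SEQUENCE %t0, sub0, %t1, sub1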
1978
1979bool SIFoldOperandsImpl::tryFoldFoldableCopy(
1980 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1981 Register DstReg = MI.getOperand(0).getReg();
1982 // Specially track simple redefs of m0 to the same value in a block, so we
1983 // can erase the later ones.
1984 if (DstReg == AMDGPU::M0) {
1985 MachineOperand &NewM0Val = MI.getOperand(1);
1986 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1987 MI.eraseFromParent();
1988 return true;
1989 }
1990
1991 // We aren't tracking other physical registers
1992 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1993 ? nullptr
1994 : &NewM0Val;
1995 return false;
1996 }
1997
1998 MachineOperand *OpToFoldPtr;
1999 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
2000 // Folding when any src_modifiers are non-zero is unsupported
2001 if (TII->hasAnyModifiersSet(MI))
2002 return false;
2003 OpToFoldPtr = &MI.getOperand(2);
2004 } else
2005 OpToFoldPtr = &MI.getOperand(1);
2006 MachineOperand &OpToFold = *OpToFoldPtr;
2007 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
2008
2009 // FIXME: We could also be folding things like TargetIndexes.
2010 if (!FoldingImm && !OpToFold.isReg())
2011 return false;
2012
2013 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
2014 return false;
2015
2016 // Prevent folding operands backwards in the function. For example,
2017 // the COPY opcode must not be replaced by 1 in this example:
2018 //
2019 // %3 = COPY %vgpr0; VGPR_32:%3
2020 // ...
2021 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
2022 if (!DstReg.isVirtual())
2023 return false;
2024
2025 const TargetRegisterClass *DstRC =
2026 MRI->getRegClass(MI.getOperand(0).getReg());
2027
2028 // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
2029 // Can remove this code if proper 16-bit SGPRs are implemented
2030 // Example: Pre-peephole-opt
2031 // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
2032 // %32:sreg_32 = COPY %29:sgpr_lo16
2033 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2034 // Post-peephole-opt and DCE
2035 // %32:sreg_32 = COPY %16.lo16:sreg_32
2036 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2037 // After this transform
2038 // %32:sreg_32 = COPY %16:sreg_32
2039 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2040 // After the fold operands pass
2041 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
2042 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
2043 OpToFold.getSubReg()) {
2044 if (DstRC == &AMDGPU::SReg_32RegClass &&
2045 DstRC == MRI->getRegClass(OpToFold.getReg())) {
2046 assert(OpToFold.getSubReg() == AMDGPU::lo16);
2047 OpToFold.setSubReg(0);
2048 }
2049 }
2050
2051 // Fold copy to AGPR through reg_sequence
2052 // TODO: Handle with subregister extract
2053 if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
2054 if (foldCopyToAGPRRegSequence(&MI))
2055 return true;
2056 }
2057
2058 FoldableDef Def(OpToFold, DstRC);
2059 bool Changed = foldInstOperand(MI, Def);
2060
2061 // If we managed to fold all uses of this copy then we might as well
2062 // delete it now.
2063 // The only reason we need to follow chains of copies here is that
2064 // tryFoldRegSequence looks forward through copies before folding a
2065 // REG_SEQUENCE into its eventual users.
2066 auto *InstToErase = &MI;
2067 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2068 auto &SrcOp = InstToErase->getOperand(1);
2069 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
2070 InstToErase->eraseFromParent();
2071 Changed = true;
2072 InstToErase = nullptr;
2073 if (!SrcReg || SrcReg.isPhysical())
2074 break;
2075 InstToErase = MRI->getVRegDef(SrcReg);
2076 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
2077 break;
2078 }
2079
2080 if (InstToErase && InstToErase->isRegSequence() &&
2081 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2082 InstToErase->eraseFromParent();
2083 Changed = true;
2084 }
2085
2086 if (Changed)
2087 return true;
2088
2089 // Run this after foldInstOperand to avoid turning scalar additions into
2090 // vector additions when the scalar result could just be folded into
2091 // the user(s).
2092 return OpToFold.isReg() &&
2093 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
2094}
2095
2096// Clamp patterns are canonically selected to v_max_* instructions, so only
2097// handle them.
2098const MachineOperand *
2099SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
2100 unsigned Op = MI.getOpcode();
2101 switch (Op) {
2102 case AMDGPU::V_MAX_F32_e64:
2103 case AMDGPU::V_MAX_F16_e64:
2104 case AMDGPU::V_MAX_F16_t16_e64:
2105 case AMDGPU::V_MAX_F16_fake16_e64:
2106 case AMDGPU::V_MAX_F64_e64:
2107 case AMDGPU::V_MAX_NUM_F64_e64:
2108 case AMDGPU::V_PK_MAX_F16:
2109 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2110 case AMDGPU::V_PK_MAX_NUM_BF16: {
2111 if (MI.mayRaiseFPException())
2112 return nullptr;
2113
2114 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
2115 return nullptr;
2116
2117 // Make sure sources are identical.
2118 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2119 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2120 if (!Src0->isReg() || !Src1->isReg() ||
2121 Src0->getReg() != Src1->getReg() ||
2122 Src0->getSubReg() != Src1->getSubReg() ||
2123 Src0->getSubReg() != AMDGPU::NoSubRegister)
2124 return nullptr;
2125
2126 // Can't fold up if we have modifiers.
2127 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2128 return nullptr;
2129
2130 unsigned Src0Mods
2131 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
2132 unsigned Src1Mods
2133 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
2134
2135 // Having a 0 op_sel_hi would require swizzling the output in the source
2136 // instruction, which we can't do.
2137 unsigned UnsetMods =
2138 (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2139 ? SISrcMods::OP_SEL_1
2140 : 0u;
2141 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2142 return nullptr;
2143 return Src0;
2144 }
2145 default:
2146 return nullptr;
2147 }
2148}
2149
2150// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
2151bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2152 const MachineOperand *ClampSrc = isClamp(MI);
2153 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2154 return false;
2155
2156 if (!ClampSrc->getReg().isVirtual())
2157 return false;
2158
2159 // Look through COPY. COPY only observed with True16.
2160 Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
2161 MachineInstr *Def =
2162 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2163
2164 // The type of clamp must be compatible.
2165 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
2166 return false;
2167
2168 if (Def->mayRaiseFPException())
2169 return false;
2170
2171 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2172 if (!DefClamp)
2173 return false;
2174
2175 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2176
2177 // Clamp is applied after omod, so it is OK if omod is set.
2178 DefClamp->setImm(1);
2179
2180 Register DefReg = Def->getOperand(0).getReg();
2181 Register MIDstReg = MI.getOperand(0).getReg();
2182 if (TRI->isSGPRReg(*MRI, DefReg)) {
2183 // Pseudo scalar instructions have an SGPR for dst and clamp is a v_max*
2184 // instruction with a VGPR dst.
2185 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
2186 MIDstReg)
2187 .addReg(DefReg);
2188 } else {
2189 MRI->replaceRegWith(MIDstReg, DefReg);
2190 }
2191 MI.eraseFromParent();
2192
2193 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2194 // instruction, so we might as well convert it to the more flexible VOP3-only
2195 // mad/fma form.
2196 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2197 Def->eraseFromParent();
2198
2199 return true;
2200}
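// [Editorial example, not part of the upstream source] Typical tryFoldClamp
// rewrite: the clamp, canonically selected as a v_max with identical sources,
// is absorbed into the single defining instruction:
//   %1 = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
//   %2 = V_MAX_F32_e64 0, %1, 0, %1, /*clamp*/ 1, 0
// becomes
//   %1 = V_ADD_F32_e64 0, %a, 0, %b, /*clamp*/ 1, 0   (uses of %2 -> %1)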
2201
2202static int getOModValue(unsigned Opc, int64_t Val) {
2203 switch (Opc) {
2204 case AMDGPU::V_MUL_F64_e64:
2205 case AMDGPU::V_MUL_F64_pseudo_e64: {
2206 switch (Val) {
2207 case 0x3fe0000000000000: // 0.5
2208 return SIOutMods::DIV2;
2209 case 0x4000000000000000: // 2.0
2210 return SIOutMods::MUL2;
2211 case 0x4010000000000000: // 4.0
2212 return SIOutMods::MUL4;
2213 default:
2214 return SIOutMods::NONE;
2215 }
2216 }
2217 case AMDGPU::V_MUL_F32_e64: {
2218 switch (static_cast<uint32_t>(Val)) {
2219 case 0x3f000000: // 0.5
2220 return SIOutMods::DIV2;
2221 case 0x40000000: // 2.0
2222 return SIOutMods::MUL2;
2223 case 0x40800000: // 4.0
2224 return SIOutMods::MUL4;
2225 default:
2226 return SIOutMods::NONE;
2227 }
2228 }
2229 case AMDGPU::V_MUL_F16_e64:
2230 case AMDGPU::V_MUL_F16_t16_e64:
2231 case AMDGPU::V_MUL_F16_fake16_e64: {
2232 switch (static_cast<uint16_t>(Val)) {
2233 case 0x3800: // 0.5
2234 return SIOutMods::DIV2;
2235 case 0x4000: // 2.0
2236 return SIOutMods::MUL2;
2237 case 0x4400: // 4.0
2238 return SIOutMods::MUL4;
2239 default:
2240 return SIOutMods::NONE;
2241 }
2242 }
2243 default:
2244 llvm_unreachable("invalid mul opcode");
2245 }
2246}
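// [Editorial note, not part of the upstream source] The magic constants above
// are simply the IEEE-754 encodings of the supported output-modifier scales,
// e.g. for f32: bit_cast<uint32_t>(0.5f) == 0x3f000000,
// bit_cast<uint32_t>(2.0f) == 0x40000000, bit_cast<uint32_t>(4.0f) ==
// 0x40800000, mapping to SIOutMods::DIV2/MUL2/MUL4; the f64 and f16 cases
// follow the same pattern with their respective encodings.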
2247
2248// FIXME: Does this really not support denormals with f16?
2249// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
2250// handled, so will anything other than that break?
2251std::pair<const MachineOperand *, int>
2252SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
2253 unsigned Op = MI.getOpcode();
2254 switch (Op) {
2255 case AMDGPU::V_MUL_F64_e64:
2256 case AMDGPU::V_MUL_F64_pseudo_e64:
2257 case AMDGPU::V_MUL_F32_e64:
2258 case AMDGPU::V_MUL_F16_t16_e64:
2259 case AMDGPU::V_MUL_F16_fake16_e64:
2260 case AMDGPU::V_MUL_F16_e64: {
2261 // If output denormals are enabled, omod is ignored.
2262 if ((Op == AMDGPU::V_MUL_F32_e64 &&
2263 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2264 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2265 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
2266 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2267 MFI->getMode().FP64FP16Denormals.Output !=
2268 DenormalMode::PreserveSign) ||
2269 MI.mayRaiseFPException())
2270 return std::pair(nullptr, SIOutMods::NONE);
2271
2272 const MachineOperand *RegOp = nullptr;
2273 const MachineOperand *ImmOp = nullptr;
2274 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2275 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2276 if (Src0->isImm()) {
2277 ImmOp = Src0;
2278 RegOp = Src1;
2279 } else if (Src1->isImm()) {
2280 ImmOp = Src1;
2281 RegOp = Src0;
2282 } else
2283 return std::pair(nullptr, SIOutMods::NONE);
2284
2285 int OMod = getOModValue(Op, ImmOp->getImm());
2286 if (OMod == SIOutMods::NONE ||
2287 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2288 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2289 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
2290 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
2291 return std::pair(nullptr, SIOutMods::NONE);
2292
2293 return std::pair(RegOp, OMod);
2294 }
2295 case AMDGPU::V_ADD_F64_e64:
2296 case AMDGPU::V_ADD_F64_pseudo_e64:
2297 case AMDGPU::V_ADD_F32_e64:
2298 case AMDGPU::V_ADD_F16_e64:
2299 case AMDGPU::V_ADD_F16_t16_e64:
2300 case AMDGPU::V_ADD_F16_fake16_e64: {
2301 // If output denormals are enabled, omod is ignored.
2302 if ((Op == AMDGPU::V_ADD_F32_e64 &&
2303 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2304 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2305 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
2306 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2307 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
2308 return std::pair(nullptr, SIOutMods::NONE);
2309
2310 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
2311 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2312 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2313
2314 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
2315 Src0->getSubReg() == Src1->getSubReg() &&
2316 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
2317 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
2318 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
2319 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2320 return std::pair(Src0, SIOutMods::MUL2);
2321
2322 return std::pair(nullptr, SIOutMods::NONE);
2323 }
2324 default:
2325 return std::pair(nullptr, SIOutMods::NONE);
2326 }
2327}
2328
2329// FIXME: Does this need to check IEEE bit on function?
2330bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2331 const MachineOperand *RegOp;
2332 int OMod;
2333 std::tie(RegOp, OMod) = isOMod(MI);
2334 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
2335 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2336 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2337 return false;
2338
2339 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2340 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2341 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
2342 return false;
2343
2344 if (Def->mayRaiseFPException())
2345 return false;
2346
2347 // Clamp is applied after omod. If the source already has clamp set, don't
2348 // fold it.
2349 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2350 return false;
2351
2352 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2353
2354 DefOMod->setImm(OMod);
2355 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2356 // Kill flags can be wrong if we replaced a def inside a loop with a def
2357 // outside the loop.
2358 MRI->clearKillFlags(Def->getOperand(0).getReg());
2359 MI.eraseFromParent();
2360
2361 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2362 // instruction, so we might as well convert it to the more flexible VOP3-only
2363 // mad/fma form.
2364 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2365 Def->eraseFromParent();
2366
2367 return true;
2368}
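// [Editorial example, not part of the upstream source] Typical tryFoldOMod
// rewrite, using the fmul-by-2.0 form matched by isOMod above:
//   %1 = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
//   %2 = V_MUL_F32_e64 0, 2.0, 0, %1, 0, 0
// becomes
//   %1 = V_ADD_F32_e64 0, %a, 0, %b, 0, /*omod mul:2*/ 1   (uses of %2 -> %1)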
2369
2370// Try to fold a reg_sequence with vgpr output and agpr inputs into an
2371// instruction which can take an agpr. So far that means a store.
2372bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
2373 assert(MI.isRegSequence());
2374 auto Reg = MI.getOperand(0).getReg();
2375
2376 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
2377 !MRI->hasOneNonDBGUse(Reg))
2378 return false;
2379
2381 if (!getRegSeqInit(Defs, Reg))
2382 return false;
2383
2384 for (auto &[Op, SubIdx] : Defs) {
2385 if (!Op->isReg())
2386 return false;
2387 if (TRI->isAGPR(*MRI, Op->getReg()))
2388 continue;
2389 // Maybe this is a COPY from AREG
2390 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
2391 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
2392 return false;
2393 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
2394 return false;
2395 }
2396
2397 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
2398 MachineInstr *UseMI = Op->getParent();
2399 while (UseMI->isCopy() && !Op->getSubReg()) {
2400 Reg = UseMI->getOperand(0).getReg();
2401 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
2402 return false;
2403 Op = &*MRI->use_nodbg_begin(Reg);
2404 UseMI = Op->getParent();
2405 }
2406
2407 if (Op->getSubReg())
2408 return false;
2409
2410 unsigned OpIdx = Op - &UseMI->getOperand(0);
2411 const MCInstrDesc &InstDesc = UseMI->getDesc();
2412 const TargetRegisterClass *OpRC =
2413 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
2414 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
2415 return false;
2416
2417 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
2418 auto Dst = MRI->createVirtualRegister(NewDstRC);
2419 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2420 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2421
2422 for (auto &[Def, SubIdx] : Defs) {
2423 Def->setIsKill(false);
2424 if (TRI->isAGPR(*MRI, Def->getReg())) {
2425 RS.add(*Def);
2426 } else { // This is a copy
2427 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
2428 SubDef->getOperand(1).setIsKill(false);
2429 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
2430 }
2431 RS.addImm(SubIdx);
2432 }
2433
2434 Op->setReg(Dst);
2435 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
2436 Op->setReg(Reg);
2437 RS->eraseFromParent();
2438 return false;
2439 }
2440
2441 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
2442
2443 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
2444 // in which case we can erase them all later in runOnMachineFunction.
2445 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
2446 MI.eraseFromParent();
2447 return true;
2448}
2449
2450 /// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success and
2451/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
2452static bool isAGPRCopy(const SIRegisterInfo &TRI,
2453 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
2454 Register &OutReg, unsigned &OutSubReg) {
2455 assert(Copy.isCopy());
2456
2457 const MachineOperand &CopySrc = Copy.getOperand(1);
2458 Register CopySrcReg = CopySrc.getReg();
2459 if (!CopySrcReg.isVirtual())
2460 return false;
2461
2462 // Common case: copy from AGPR directly, e.g.
2463 // %1:vgpr_32 = COPY %0:agpr_32
2464 if (TRI.isAGPR(MRI, CopySrcReg)) {
2465 OutReg = CopySrcReg;
2466 OutSubReg = CopySrc.getSubReg();
2467 return true;
2468 }
2469
2470 // Sometimes it can also involve two copies, e.g.
2471 // %1:vgpr_256 = COPY %0:agpr_256
2472 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2473 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2474 if (!CopySrcDef || !CopySrcDef->isCopy())
2475 return false;
2476
2477 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2478 Register OtherCopySrcReg = OtherCopySrc.getReg();
2479 if (!OtherCopySrcReg.isVirtual() ||
2480 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2481 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2482 !TRI.isAGPR(MRI, OtherCopySrcReg))
2483 return false;
2484
2485 OutReg = OtherCopySrcReg;
2486 OutSubReg = CopySrc.getSubReg();
2487 return true;
2488}
2489
2490// Try to hoist an AGPR to VGPR copy across a PHI.
2491// This should allow folding of an AGPR into a consumer which may support it.
2492//
2493// Example 1: LCSSA PHI
2494// loop:
2495// %1:vreg = COPY %0:areg
2496// exit:
2497// %2:vreg = PHI %1:vreg, %loop
2498// =>
2499// loop:
2500// exit:
2501// %1:areg = PHI %0:areg, %loop
2502// %2:vreg = COPY %1:areg
2503//
2504// Example 2: PHI with multiple incoming values:
2505// entry:
2506// %1:vreg = GLOBAL_LOAD(..)
2507// loop:
2508// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2509// %3:areg = COPY %2:vreg
2510// %4:areg = (instr using %3:areg)
2511// %5:vreg = COPY %4:areg
2512// =>
2513// entry:
2514// %1:vreg = GLOBAL_LOAD(..)
2515// %2:areg = COPY %1:vreg
2516// loop:
2517// %3:areg = PHI %2:areg, %entry, %X:areg,
2518// %4:areg = (instr using %3:areg)
2519bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2520 assert(PHI.isPHI());
2521
2522 Register PhiOut = PHI.getOperand(0).getReg();
2523 if (!TRI->isVGPR(*MRI, PhiOut))
2524 return false;
2525
2526 // Iterate once over all incoming values of the PHI to check if this PHI is
2527 // eligible, and determine the exact AGPR RC we'll target.
2528 const TargetRegisterClass *ARC = nullptr;
2529 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2530 MachineOperand &MO = PHI.getOperand(K);
2531 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2532 if (!Copy || !Copy->isCopy())
2533 continue;
2534
2535 Register AGPRSrc;
2536 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2537 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2538 continue;
2539
2540 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2541 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2542 CopyInRC = SubRC;
2543
2544 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2545 return false;
2546 ARC = CopyInRC;
2547 }
2548
2549 if (!ARC)
2550 return false;
2551
2552 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2553
2554 // Rewrite the PHI's incoming values to ARC.
2555 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2556 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2557 MachineOperand &MO = PHI.getOperand(K);
2558 Register Reg = MO.getReg();
2559
2560 MachineBasicBlock::iterator InsertPt;
2561 MachineBasicBlock *InsertMBB = nullptr;
2562
2563 // Look at the def of Reg, ignoring all copies.
2564 unsigned CopyOpc = AMDGPU::COPY;
2565 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2566
2567 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2568 // the copy was single-use, it will be removed by DCE later.
2569 if (Def->isCopy()) {
2570 Register AGPRSrc;
2571 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2572 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2573 MO.setReg(AGPRSrc);
2574 MO.setSubReg(AGPRSubReg);
2575 continue;
2576 }
2577
2578 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2579 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2580 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2581 // is unlikely to be profitable.
2582 //
2583 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2584 MachineOperand &CopyIn = Def->getOperand(1);
2585 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2586 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2587 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2588 }
2589
2590 InsertMBB = Def->getParent();
2591 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2592 } else {
2593 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2594 InsertPt = InsertMBB->getFirstTerminator();
2595 }
2596
2597 Register NewReg = MRI->createVirtualRegister(ARC);
2598 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2599 TII->get(CopyOpc), NewReg)
2600 .addReg(Reg);
2601 MO.setReg(NewReg);
2602
2603 (void)MI;
2604 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2605 }
2606
2607 // Replace the PHI's result with a new register.
2608 Register NewReg = MRI->createVirtualRegister(ARC);
2609 PHI.getOperand(0).setReg(NewReg);
2610
2611 // COPY that new register back to the original PhiOut register. This COPY will
2612 // usually be folded out later.
2613 MachineBasicBlock *MBB = PHI.getParent();
2614 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2615 TII->get(AMDGPU::COPY), PhiOut)
2616 .addReg(NewReg);
2617
2618 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2619 return true;
2620}
2621
2622// Attempt to convert VGPR load to an AGPR load.
2623bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2624 assert(MI.mayLoad());
2625 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2626 return false;
2627
2628 MachineOperand &Def = MI.getOperand(0);
2629 if (!Def.isDef())
2630 return false;
2631
2632 Register DefReg = Def.getReg();
2633
2634 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2635 return false;
2636
2637 SmallVector<const MachineInstr *, 8> Users(
2638 llvm::make_pointer_range(MRI->use_nodbg_instructions(DefReg)));
2639 SmallVector<Register, 8> MoveRegs;
2640
2641 if (Users.empty())
2642 return false;
2643
2644 // Check that all uses are a copy to an agpr or a reg_sequence producing an agpr.
2645 while (!Users.empty()) {
2646 const MachineInstr *I = Users.pop_back_val();
2647 if (!I->isCopy() && !I->isRegSequence())
2648 return false;
2649 Register DstReg = I->getOperand(0).getReg();
2650 // Physical registers may have more than one defining instruction
2651 if (DstReg.isPhysical())
2652 return false;
2653 if (TRI->isAGPR(*MRI, DstReg))
2654 continue;
2655 MoveRegs.push_back(DstReg);
2656 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2657 Users.push_back(&U);
2658 }
2659
2660 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2661 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2662 if (!TII->isOperandLegal(MI, 0, &Def)) {
2663 MRI->setRegClass(DefReg, RC);
2664 return false;
2665 }
2666
2667 while (!MoveRegs.empty()) {
2668 Register Reg = MoveRegs.pop_back_val();
2669 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2670 }
2671
2672 LLVM_DEBUG(dbgs() << "Folded " << MI);
2673
2674 return true;
2675}
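// [Editorial example, not part of the upstream source] tryFoldLoad does not
// rewrite the load itself; it re-classes the destination vreg (and the chain
// of copy/reg_sequence results feeding AGPRs) so the load writes an AGPR
// directly on gfx90a+:
//   %v:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN ...  ; every use ends up in an AGPR
// becomes
//   %v:agpr_32 = BUFFER_LOAD_DWORD_OFFEN ...  ; intermediate copies reclassed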
2676
2677// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2678// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2679 // there are cases where it can create a lot more AGPR-AGPR copies, which are
2680// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2681//
2682// This function looks at all AGPR PHIs in a basic block and collects their
2683 // operands. Then, it checks for registers that are used more than once across
2684// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2685// having to create one VGPR temporary per use, which can get very messy if
2686// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2687// element).
2688//
2689// Example
2690// a:
2691// %in:agpr_256 = COPY %foo:vgpr_256
2692// c:
2693// %x:agpr_32 = ..
2694// b:
2695// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2696// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2697// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2698// =>
2699// a:
2700// %in:agpr_256 = COPY %foo:vgpr_256
2701// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2702// %tmp_agpr:agpr_32 = COPY %tmp
2703// c:
2704// %x:agpr_32 = ..
2705// b:
2706// %0:areg = PHI %tmp_agpr, %a, %x, %c
2707// %1:areg = PHI %tmp_agpr, %a, %y, %c
2708// %2:areg = PHI %tmp_agpr, %a, %z, %c
2709bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2710 // This is only really needed on GFX908 where AGPR-AGPR copies are
2711 // unreasonably difficult.
2712 if (ST->hasGFX90AInsts())
2713 return false;
2714
2715 // Look at all AGPR Phis and collect the register + subregister used.
2716 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2717 RegToMO;
2718
2719 for (auto &MI : MBB) {
2720 if (!MI.isPHI())
2721 break;
2722
2723 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2724 continue;
2725
2726 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2727 MachineOperand &PhiMO = MI.getOperand(K);
2728 if (!PhiMO.getSubReg())
2729 continue;
2730 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2731 }
2732 }
2733
2734 // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2735 // a VGPR.
2736 bool Changed = false;
2737 for (const auto &[Entry, MOs] : RegToMO) {
2738 if (MOs.size() == 1)
2739 continue;
2740
2741 const auto [Reg, SubReg] = Entry;
2742 MachineInstr *Def = MRI->getVRegDef(Reg);
2743 MachineBasicBlock *DefMBB = Def->getParent();
2744
2745 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2746 // out.
2747 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2748 Register TempVGPR =
2749 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2750 MachineInstr *VGPRCopy =
2751 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2752 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2753 .addReg(Reg, /* flags */ 0, SubReg);
2754
2755 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2756 Register TempAGPR = MRI->createVirtualRegister(ARC);
2757 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2758 TII->get(AMDGPU::COPY), TempAGPR)
2759 .addReg(TempVGPR);
2760
2761 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2762 for (MachineOperand *MO : MOs) {
2763 MO->setReg(TempAGPR);
2764 MO->setSubReg(AMDGPU::NoSubRegister);
2765 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2766 }
2767
2768 Changed = true;
2769 }
2770
2771 return Changed;
2772}
2773
2774bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2775 this->MF = &MF;
2776 MRI = &MF.getRegInfo();
2777 ST = &MF.getSubtarget<GCNSubtarget>();
2778 TII = ST->getInstrInfo();
2779 TRI = &TII->getRegisterInfo();
2780 MFI = MF.getInfo<SIMachineFunctionInfo>();
2781
2782 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2783 // correctly handle signed zeros.
2784 //
2785 // FIXME: Also need to check strictfp
2786 bool IsIEEEMode = MFI->getMode().IEEE;
2787 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2788
2789 bool Changed = false;
2790 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2791 MachineOperand *CurrentKnownM0Val = nullptr;
2792 for (auto &MI : make_early_inc_range(*MBB)) {
2793 Changed |= tryFoldCndMask(MI);
2794
2795 if (tryFoldZeroHighBits(MI)) {
2796 Changed = true;
2797 continue;
2798 }
2799
2800 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2801 Changed = true;
2802 continue;
2803 }
2804
2805 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2806 Changed = true;
2807 continue;
2808 }
2809
2810 if (MI.mayLoad() && tryFoldLoad(MI)) {
2811 Changed = true;
2812 continue;
2813 }
2814
2815 if (TII->isFoldableCopy(MI)) {
2816 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2817 continue;
2818 }
2819
2820 // Saw an unknown clobber of m0, so we no longer know what it is.
2821 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2822 CurrentKnownM0Val = nullptr;
2823
2824 // TODO: Omod might be OK if there is NSZ only on the source
2825 // instruction, and not the omod multiply.
2826 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2827 !tryFoldOMod(MI))
2828 Changed |= tryFoldClamp(MI);
2829 }
2830
2831 Changed |= tryOptimizeAGPRPhis(*MBB);
2832 }
2833
2834 return Changed;
2835}
2836
2837 PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
2838 MachineFunctionAnalysisManager &MFAM) {
2839 MFPropsModifier _(*this, MF);
2840
2841 bool Changed = SIFoldOperandsImpl().run(MF);
2842 if (!Changed) {
2843 return PreservedAnalyses::all();
2844 }
2845 auto PA = getMachineFunctionPassPreservedAnalyses();
2846 PA.preserveSet<CFGAnalyses>();
2847 return PA;
2848}