SIFoldOperands.cpp
1//===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
15#include "SIInstrInfo.h"
17#include "SIRegisterInfo.h"
22
23#define DEBUG_TYPE "si-fold-operands"
24using namespace llvm;
25
26namespace {
27
28/// Track a value we may want to fold into downstream users, applying
29/// subregister extracts along the way.
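///
/// A rough illustration (hypothetical MIR, not taken from the source
/// comments): given
///   %1:sreg_64 = S_MOV_B64 0
///   %2:sreg_32 = COPY %1.sub1
/// the def is recorded as the immediate 0 with DefSubReg = sub1, so users of
/// %2 can be offered the extracted 32-bit value directly.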
30struct FoldableDef {
31 union {
32 MachineOperand *OpToFold = nullptr;
33 uint64_t ImmToFold;
34 int FrameIndexToFold;
35 };
36
37 /// Register class of the originally defined value.
38 const TargetRegisterClass *DefRC = nullptr;
39
40 /// Track the original defining instruction for the value.
41 const MachineInstr *DefMI = nullptr;
42
43 /// Subregister to apply to the value at the use point.
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
45
46 /// Kind of value stored in the union.
47 MachineOperand::MachineOperandType Kind;
48
49 FoldableDef() = delete;
50 FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
53
54 if (FoldOp.isImm()) {
55 ImmToFold = FoldOp.getImm();
56 } else if (FoldOp.isFI()) {
57 FrameIndexToFold = FoldOp.getIndex();
58 } else {
59 assert(FoldOp.isReg() || FoldOp.isGlobal());
60 OpToFold = &FoldOp;
61 }
62
63 DefMI = FoldOp.getParent();
64 }
65
66 FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
69 Kind(MachineOperand::MO_Immediate) {}
70
71 /// Copy the current def and apply \p SubReg to the value.
72 FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
73 FoldableDef Copy(*this);
74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
75 return Copy;
76 }
77
78 bool isReg() const { return Kind == MachineOperand::MO_Register; }
79
80 Register getReg() const {
81 assert(isReg());
82 return OpToFold->getReg();
83 }
84
85 unsigned getSubReg() const {
86 assert(isReg());
87 return OpToFold->getSubReg();
88 }
89
90 bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
91
92 bool isFI() const {
93 return Kind == MachineOperand::MO_FrameIndex;
94 }
95
96 int getFI() const {
97 assert(isFI());
98 return FrameIndexToFold;
99 }
100
101 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
102
103 /// Return the effective immediate value defined by this instruction, after
104 /// application of any subregister extracts which may exist between the use
105 /// and def instruction.
106 std::optional<int64_t> getEffectiveImmVal() const {
107 assert(isImm());
108 return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
109 }
110
111 /// Check if it is legal to fold this effective value into \p MI's \p OpNo
112 /// operand.
113 bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
114 unsigned OpIdx) const {
115 switch (Kind) {
116 case MachineOperand::MO_Immediate: {
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
118 if (!ImmToFold)
119 return false;
120
121 // TODO: Should verify the subregister index is supported by the class
122 // TODO: Avoid the temporary MachineOperand
123 MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
125 }
126 case MachineOperand::MO_FrameIndex: {
127 if (DefSubReg != AMDGPU::NoSubRegister)
128 return false;
129 MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
131 }
132 default:
133 // TODO: Try to apply DefSubReg, for global address we can extract
134 // low/high.
135 if (DefSubReg != AMDGPU::NoSubRegister)
136 return false;
137 return TII.isOperandLegal(MI, OpIdx, OpToFold);
138 }
139
140 llvm_unreachable("covered MachineOperand kind switch");
141 }
142};
143
144struct FoldCandidate {
145 MachineInstr *UseMI;
146 FoldableDef Def;
147 int ShrinkOpcode;
148 unsigned UseOpNo;
149 bool Commuted;
150
151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
152 bool Commuted = false, int ShrinkOp = -1)
153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
155
156 bool isFI() const { return Def.isFI(); }
157
158 int getFI() const {
159 assert(isFI());
160 return Def.FrameIndexToFold;
161 }
162
163 bool isImm() const { return Def.isImm(); }
164
165 bool isReg() const { return Def.isReg(); }
166
167 Register getReg() const { return Def.getReg(); }
168
169 bool isGlobal() const { return Def.isGlobal(); }
170
171 bool needsShrink() const { return ShrinkOpcode != -1; }
172};
173
174class SIFoldOperandsImpl {
175public:
176 MachineFunction *MF;
177 MachineRegisterInfo *MRI;
178 const SIInstrInfo *TII;
179 const SIRegisterInfo *TRI;
180 const GCNSubtarget *ST;
181 const SIMachineFunctionInfo *MFI;
182
183 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
184 const FoldableDef &OpToFold) const;
185
186 // TODO: Just use TII::getVALUOp
187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
188 switch (Opc) {
189 case AMDGPU::S_ADD_I32: {
190 if (ST->hasAddNoCarry())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
193 }
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
200 default:
201 return AMDGPU::INSTRUCTION_LIST_END;
202 }
203 }
204
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
206 MachineInstr &MI) const;
207
208 bool updateOperand(FoldCandidate &Fold) const;
209
210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
211 int64_t ImmVal) const;
212
213 /// Try to fold immediate \p ImmVal into \p MI's operand at index \p UseOpNo.
214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
215 int64_t ImmVal) const;
216
217 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
218 MachineInstr *MI, unsigned OpNo,
219 const FoldableDef &OpToFold) const;
220 bool isUseSafeToFold(const MachineInstr &MI,
221 const MachineOperand &UseMO) const;
222
223 const TargetRegisterClass *getRegSeqInit(
224 MachineInstr &RegSeq,
225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
226
227 const TargetRegisterClass *
228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
229 Register UseReg) const;
230
231 std::pair<int64_t, const TargetRegisterClass *>
232 isRegSeqSplat(MachineInstr &RegSeg) const;
233
234 bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
235 int64_t SplatVal,
236 const TargetRegisterClass *SplatRC) const;
237
238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
239 unsigned UseOpIdx,
240 SmallVectorImpl<FoldCandidate> &FoldList) const;
241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
242 SmallVectorImpl<FoldCandidate> &FoldList,
243 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
244
245 std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
246 bool tryConstantFoldOp(MachineInstr *MI) const;
247 bool tryFoldCndMask(MachineInstr &MI) const;
248 bool tryFoldZeroHighBits(MachineInstr &MI) const;
249 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
250
251 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
252 bool tryFoldFoldableCopy(MachineInstr &MI,
253 MachineOperand *&CurrentKnownM0Val) const;
254
255 const MachineOperand *isClamp(const MachineInstr &MI) const;
256 bool tryFoldClamp(MachineInstr &MI);
257
258 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
259 bool tryFoldOMod(MachineInstr &MI);
260 bool tryFoldRegSequence(MachineInstr &MI);
261 bool tryFoldPhiAGPR(MachineInstr &MI);
262 bool tryFoldLoad(MachineInstr &MI);
263
264 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
265
266public:
267 SIFoldOperandsImpl() = default;
268
269 bool run(MachineFunction &MF);
270};
271
272class SIFoldOperandsLegacy : public MachineFunctionPass {
273public:
274 static char ID;
275
276 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
277
278 bool runOnMachineFunction(MachineFunction &MF) override {
279 if (skipFunction(MF.getFunction()))
280 return false;
281 return SIFoldOperandsImpl().run(MF);
282 }
283
284 StringRef getPassName() const override { return "SI Fold Operands"; }
285
286 void getAnalysisUsage(AnalysisUsage &AU) const override {
287 AU.setPreservesCFG();
288 MachineFunctionPass::getAnalysisUsage(AU);
290
291 MachineFunctionProperties getRequiredProperties() const override {
292 return MachineFunctionProperties().setIsSSA();
293 }
294};
295
296} // End anonymous namespace.
297
298INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
299 false)
300
301char SIFoldOperandsLegacy::ID = 0;
302
303char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
304
305static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
306 const TargetRegisterInfo &TRI,
307 const MachineOperand &MO) {
308 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
309 if (const TargetRegisterClass *SubRC =
310 TRI.getSubRegisterClass(RC, MO.getSubReg()))
311 RC = SubRC;
312 return RC;
313}
314
315// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
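// Illustrative sketch (operands are hypothetical): a tied
//   %dst = V_MAC_F32_e64 %a, %b, %dst
// becomes
//   %dst = V_MAD_F32_e64 %a, %b, %acc
// whose untied src2 can then accept a folded immediate or SGPR.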
316static unsigned macToMad(unsigned Opc) {
317 switch (Opc) {
318 case AMDGPU::V_MAC_F32_e64:
319 return AMDGPU::V_MAD_F32_e64;
320 case AMDGPU::V_MAC_F16_e64:
321 return AMDGPU::V_MAD_F16_e64;
322 case AMDGPU::V_FMAC_F32_e64:
323 return AMDGPU::V_FMA_F32_e64;
324 case AMDGPU::V_FMAC_F16_e64:
325 return AMDGPU::V_FMA_F16_gfx9_e64;
326 case AMDGPU::V_FMAC_F16_t16_e64:
327 return AMDGPU::V_FMA_F16_gfx9_t16_e64;
328 case AMDGPU::V_FMAC_F16_fake16_e64:
329 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
330 case AMDGPU::V_FMAC_LEGACY_F32_e64:
331 return AMDGPU::V_FMA_LEGACY_F32_e64;
332 case AMDGPU::V_FMAC_F64_e64:
333 return AMDGPU::V_FMA_F64_e64;
334 }
335 return AMDGPU::INSTRUCTION_LIST_END;
336}
337
338// TODO: Add heuristic that the frame index might not fit in the addressing mode
339// immediate offset to avoid materializing in loops.
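// Rough illustration (hypothetical MIR): rather than materializing the index,
//   %0:vgpr_32 = V_MOV_B32_e32 %stack.0
//   SCRATCH_LOAD_DWORD %0, ...
// the frame index may be used directly as the scratch/MUBUF address operand
// when the addressing mode allows it.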
340bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
341 const FoldableDef &OpToFold) const {
342 if (!OpToFold.isFI())
343 return false;
344
345 const unsigned Opc = UseMI.getOpcode();
346 switch (Opc) {
347 case AMDGPU::S_ADD_I32:
348 case AMDGPU::S_ADD_U32:
349 case AMDGPU::V_ADD_U32_e32:
350 case AMDGPU::V_ADD_CO_U32_e32:
351 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
352 // to insert the wave size shift at every point we use the index.
353 // TODO: Fix depending on visit order to fold immediates into the operand
354 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
355 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
356 case AMDGPU::V_ADD_U32_e64:
357 case AMDGPU::V_ADD_CO_U32_e64:
358 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
359 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
360 default:
361 break;
362 }
363
364 if (TII->isMUBUF(UseMI))
365 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
366 if (!TII->isFLATScratch(UseMI))
367 return false;
368
369 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
370 if (OpNo == SIdx)
371 return true;
372
373 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
374 return OpNo == VIdx && SIdx == -1;
375}
376
377/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
378///
379/// => %vgpr = V_ADD_U32 x, frameindex
380bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
381 Register DstReg, Register SrcReg, MachineInstr &MI) const {
382 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
383 MRI->hasOneNonDBGUse(SrcReg)) {
384 MachineInstr *Def = MRI->getVRegDef(SrcReg);
385 if (!Def || Def->getNumOperands() != 4)
386 return false;
387
388 MachineOperand *Src0 = &Def->getOperand(1);
389 MachineOperand *Src1 = &Def->getOperand(2);
390
391 // TODO: This is profitable with more operand types, and for more
392 // opcodes. But ultimately this is working around poor / nonexistent
393 // regbankselect.
394 if (!Src0->isFI() && !Src1->isFI())
395 return false;
396
397 if (Src0->isFI())
398 std::swap(Src0, Src1);
399
400 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
401 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
402 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
403 !Def->getOperand(3).isDead()) // Check if scc is dead
404 return false;
405
406 MachineBasicBlock *MBB = Def->getParent();
407 const DebugLoc &DL = Def->getDebugLoc();
408 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
409 MachineInstrBuilder Add =
410 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
411
412 if (Add->getDesc().getNumDefs() == 2) {
413 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
414 Add.addDef(CarryOutReg, RegState::Dead);
415 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
416 }
417
418 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
419 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
420 Add.addImm(0);
421
422 Def->eraseFromParent();
423 MI.eraseFromParent();
424 return true;
425 }
426
427 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
428
429 MachineBasicBlock::LivenessQueryResult Liveness =
430 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
431 if (Liveness == MachineBasicBlock::LQR_Dead) {
432 // TODO: If src1 satisfies operand constraints, use vop3 version.
433 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
434 .add(*Src0)
435 .add(*Src1)
436 .setOperandDead(3) // implicit-def $vcc
437 .setMIFlags(Def->getFlags());
438 Def->eraseFromParent();
439 MI.eraseFromParent();
440 return true;
441 }
442 }
443
444 return false;
445}
446
447FunctionPass *llvm::createSIFoldOperandsLegacyPass() {
448 return new SIFoldOperandsLegacy();
449}
450
451bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
452 unsigned UseOpNo,
453 int64_t ImmVal) const {
454 const uint64_t TSFlags = MI->getDesc().TSFlags;
455
456 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
457 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
458 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
459 return false;
460
461 const MachineOperand &Old = MI->getOperand(UseOpNo);
462 int OpNo = MI->getOperandNo(&Old);
463
464 unsigned Opcode = MI->getOpcode();
465 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
466 switch (OpType) {
467 default:
468 return false;
476 // VOP3 packed instructions ignore op_sel source modifiers, we cannot encode
477 // two different constants.
478 if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) &&
479 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
480 return false;
481 break;
482 }
483
484 return true;
485}
486
487bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
488 int64_t ImmVal) const {
489 MachineOperand &Old = MI->getOperand(UseOpNo);
490 unsigned Opcode = MI->getOpcode();
491 int OpNo = MI->getOperandNo(&Old);
492 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
493
494 // If the literal can be inlined as-is, apply it and short-circuit the
495 // tests below. The main motivation for this is to avoid unintuitive
496 // uses of opsel.
497 if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
498 Old.ChangeToImmediate(ImmVal);
499 return true;
500 }
501
502 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
503 // op_sel in a way that allows an inline constant.
504 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
505 unsigned SrcIdx = ~0;
506 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
507 ModName = AMDGPU::OpName::src0_modifiers;
508 SrcIdx = 0;
509 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
510 ModName = AMDGPU::OpName::src1_modifiers;
511 SrcIdx = 1;
512 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
513 ModName = AMDGPU::OpName::src2_modifiers;
514 SrcIdx = 2;
515 }
516 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
517 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
518 MachineOperand &Mod = MI->getOperand(ModIdx);
519 unsigned ModVal = Mod.getImm();
520
521 uint16_t ImmLo =
522 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
523 uint16_t ImmHi =
524 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
525 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
526 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
527
528 // Helper function that attempts to inline the given value with a newly
529 // chosen opsel pattern.
530 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
531 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
532 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
533 Old.ChangeToImmediate(Imm);
534 return true;
535 }
536
537 // Try to shuffle the halves around and leverage opsel to get an inline
538 // constant.
539 uint16_t Lo = static_cast<uint16_t>(Imm);
540 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
541 if (Lo == Hi) {
542 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
543 Mod.setImm(NewModVal);
544 Old.ChangeToImmediate(Lo);
545 return true;
546 }
547
548 if (static_cast<int16_t>(Lo) < 0) {
549 int32_t SExt = static_cast<int16_t>(Lo);
550 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
551 Mod.setImm(NewModVal);
552 Old.ChangeToImmediate(SExt);
553 return true;
554 }
555 }
556
557 // This check is only useful for integer instructions
558 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16) {
559 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
560 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
561 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
562 return true;
563 }
564 }
565 } else {
566 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
567 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
568 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
569 Old.ChangeToImmediate(Swapped);
570 return true;
571 }
572 }
573
574 return false;
575 };
576
577 if (tryFoldToInline(Imm))
578 return true;
579
580 // Replace integer addition by subtraction and vice versa if it allows
581 // folding the immediate to an inline constant.
582 //
583 // We should only ever get here for SrcIdx == 1 due to canonicalization
584 // earlier in the pipeline, but we double-check here to be safe / fully
585 // general.
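  // Hedged example (values chosen for illustration): 0xffe0ffe0 (both halves
  // -32) has no inline encoding, but its negation 0x00200020 (both halves 32)
  // does, so a v_pk_add_u16 of the former can become a v_pk_sub_u16 of the
  // latter.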
586 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
587 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
588 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
589 unsigned ClampIdx =
590 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
591 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
592
593 if (!Clamp) {
594 uint16_t NegLo = -static_cast<uint16_t>(Imm);
595 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
596 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
597
598 if (tryFoldToInline(NegImm)) {
599 unsigned NegOpcode =
600 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
601 MI->setDesc(TII->get(NegOpcode));
602 return true;
603 }
604 }
605 }
606
607 return false;
608}
609
610bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
611 MachineInstr *MI = Fold.UseMI;
612 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
613 assert(Old.isReg());
614
615 std::optional<int64_t> ImmVal;
616 if (Fold.isImm())
617 ImmVal = Fold.Def.getEffectiveImmVal();
618
619 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
620 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
621 return true;
622
623 // We can't represent the candidate as an inline constant. Try as a literal
624 // with the original opsel, checking constant bus limitations.
625 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
626 int OpNo = MI->getOperandNo(&Old);
627 if (!TII->isOperandLegal(*MI, OpNo, &New))
628 return false;
629 Old.ChangeToImmediate(*ImmVal);
630 return true;
631 }
632
633 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
634 MachineBasicBlock *MBB = MI->getParent();
635 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
636 if (Liveness != MachineBasicBlock::LQR_Dead) {
637 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
638 return false;
639 }
640
641 int Op32 = Fold.ShrinkOpcode;
642 MachineOperand &Dst0 = MI->getOperand(0);
643 MachineOperand &Dst1 = MI->getOperand(1);
644 assert(Dst0.isDef() && Dst1.isDef());
645
646 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
647
648 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
649 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
650
651 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
652
653 if (HaveNonDbgCarryUse) {
654 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
655 Dst1.getReg())
656 .addReg(AMDGPU::VCC, RegState::Kill);
657 }
658
659 // Keep the old instruction around to avoid breaking iterators, but
660 // replace it with a dummy instruction to remove uses.
661 //
662 // FIXME: We should not invert how this pass looks at operands to avoid
663 // this. Should track set of foldable movs instead of looking for uses
664 // when looking at a use.
665 Dst0.setReg(NewReg0);
666 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
667 MI->removeOperand(I);
668 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
669
670 if (Fold.Commuted)
671 TII->commuteInstruction(*Inst32, false);
672 return true;
673 }
674
675 assert(!Fold.needsShrink() && "not handled");
676
677 if (ImmVal) {
678 if (Old.isTied()) {
679 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
680 if (NewMFMAOpc == -1)
681 return false;
682 MI->setDesc(TII->get(NewMFMAOpc));
683 MI->untieRegOperand(0);
684 }
685
686 // TODO: Should we try to avoid adding this to the candidate list?
687 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
688 int OpNo = MI->getOperandNo(&Old);
689 if (!TII->isOperandLegal(*MI, OpNo, &New))
690 return false;
691
692 Old.ChangeToImmediate(*ImmVal);
693 return true;
694 }
695
696 if (Fold.isGlobal()) {
697 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
698 Fold.Def.OpToFold->getOffset(),
699 Fold.Def.OpToFold->getTargetFlags());
700 return true;
701 }
702
703 if (Fold.isFI()) {
704 Old.ChangeToFrameIndex(Fold.getFI());
705 return true;
706 }
707
708 MachineOperand *New = Fold.Def.OpToFold;
709
710 // Verify the register is compatible with the operand.
711 if (const TargetRegisterClass *OpRC =
712 TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
713 const TargetRegisterClass *NewRC =
714 TRI->getRegClassForReg(*MRI, New->getReg());
715 const TargetRegisterClass *ConstrainRC =
716 TRI->findCommonRegClass(OpRC, Old.getSubReg(), NewRC, New->getSubReg());
717 if (!ConstrainRC)
718 return false;
719
720 if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
721 LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
722 << TRI->getRegClassName(ConstrainRC) << '\n');
723 return false;
724 }
725 }
726
727 // Rework once the VS_16 register class is updated to include proper
728 // 16-bit SGPRs instead of 32-bit ones.
729 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
730 Old.setSubReg(AMDGPU::NoSubRegister);
731 if (New->getReg().isPhysical()) {
732 Old.substPhysReg(New->getReg(), *TRI);
733 } else {
734 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
735 Old.setIsUndef(New->isUndef());
736 }
737 return true;
738}
739
740static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
741 FoldCandidate &&Entry) {
742 // Skip additional folding on the same operand.
743 for (FoldCandidate &Fold : FoldList)
744 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
745 return;
746 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
747 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
748 FoldList.push_back(Entry);
749}
750
751static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
752 MachineInstr *MI, unsigned OpNo,
753 const FoldableDef &FoldOp,
754 bool Commuted = false, int ShrinkOp = -1) {
755 appendFoldCandidate(FoldList,
756 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
757}
758
759bool SIFoldOperandsImpl::tryAddToFoldList(
760 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
761 const FoldableDef &OpToFold) const {
762 const unsigned Opc = MI->getOpcode();
763
764 auto tryToFoldAsFMAAKorMK = [&]() {
765 if (!OpToFold.isImm())
766 return false;
767
768 const bool TryAK = OpNo == 3;
769 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
770 MI->setDesc(TII->get(NewOpc));
771
772 // We have to fold into operand which would be Imm not into OpNo.
773 bool FoldAsFMAAKorMK =
774 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
775 if (FoldAsFMAAKorMK) {
776 // Untie Src2 of fmac.
777 MI->untieRegOperand(3);
778 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
779 if (OpNo == 1) {
780 MachineOperand &Op1 = MI->getOperand(1);
781 MachineOperand &Op2 = MI->getOperand(2);
782 Register OldReg = Op1.getReg();
783 // Operand 2 might be an inlinable constant
784 if (Op2.isImm()) {
785 Op1.ChangeToImmediate(Op2.getImm());
786 Op2.ChangeToRegister(OldReg, false);
787 } else {
788 Op1.setReg(Op2.getReg());
789 Op2.setReg(OldReg);
790 }
791 }
792 return true;
793 }
794 MI->setDesc(TII->get(Opc));
795 return false;
796 };
797
798 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
799 if (!IsLegal && OpToFold.isImm()) {
800 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
801 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
802 }
803
804 if (!IsLegal) {
805 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
806 unsigned NewOpc = macToMad(Opc);
807 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
808 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
809 // to fold the operand.
810 MI->setDesc(TII->get(NewOpc));
811 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
812 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
813 if (AddOpSel)
814 MI->addOperand(MachineOperand::CreateImm(0));
815 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
816 if (FoldAsMAD) {
817 MI->untieRegOperand(OpNo);
818 return true;
819 }
820 if (AddOpSel)
821 MI->removeOperand(MI->getNumExplicitOperands() - 1);
822 MI->setDesc(TII->get(Opc));
823 }
824
825 // Special case for s_fmac_f32 if we are trying to fold into Src2.
826 // By transforming into fmaak we can untie Src2 and make folding legal.
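    // Sketch (hypothetical operands):
    //   %3 = S_FMAC_F32 %0, %1, %3        ; src2 tied, literal not foldable
    //   => %3 = S_FMAAK_F32 %0, %1, <literal>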
827 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
828 if (tryToFoldAsFMAAKorMK())
829 return true;
830 }
831
832 // Special case for s_setreg_b32
833 if (OpToFold.isImm()) {
834 unsigned ImmOpc = 0;
835 if (Opc == AMDGPU::S_SETREG_B32)
836 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
837 else if (Opc == AMDGPU::S_SETREG_B32_mode)
838 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
839 if (ImmOpc) {
840 MI->setDesc(TII->get(ImmOpc));
841 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
842 return true;
843 }
844 }
845
846 // Operand is not legal, so try to commute the instruction to
847 // see if this makes it possible to fold.
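    // e.g. (illustrative) an immediate that is not legal in src1 of a VOP2
    // encoding may become foldable once the operands are swapped and it lands
    // in src0 instead.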
848 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
849 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
850 if (!CanCommute)
851 return false;
852
853 MachineOperand &Op = MI->getOperand(OpNo);
854 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
855
856 // One of operands might be an Imm operand, and OpNo may refer to it after
857 // the call of commuteInstruction() below. Such situations are avoided
858 // here explicitly as OpNo must be a register operand to be a candidate
859 // for memory folding.
860 if (!Op.isReg() || !CommutedOp.isReg())
861 return false;
862
863 // The same situation with an immediate could reproduce if both inputs are
864 // the same register.
865 if (Op.isReg() && CommutedOp.isReg() &&
866 (Op.getReg() == CommutedOp.getReg() &&
867 Op.getSubReg() == CommutedOp.getSubReg()))
868 return false;
869
870 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
871 return false;
872
873 int Op32 = -1;
874 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
875 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
876 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
877 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
878 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
879 return false;
880 }
881
882 // Verify the other operand is a VGPR, otherwise we would violate the
883 // constant bus restriction.
884 MachineOperand &OtherOp = MI->getOperand(OpNo);
885 if (!OtherOp.isReg() ||
886 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
887 return false;
888
889 assert(MI->getOperand(1).isDef());
890
891 // Make sure to get the 32-bit version of the commuted opcode.
892 unsigned MaybeCommutedOpc = MI->getOpcode();
893 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
894 }
895
896 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
897 Op32);
898 return true;
899 }
900
901 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
902 // By changing into fmamk we can untie Src2.
903 // If folding for Src0 happens first and it is identical operand to Src1 we
904 // should avoid transforming into fmamk which requires commuting as it would
905 // cause folding into Src1 to fail later on due to wrong OpNo used.
906 if (Opc == AMDGPU::S_FMAC_F32 &&
907 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
908 if (tryToFoldAsFMAAKorMK())
909 return true;
910 }
911
912 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
913 return true;
914}
915
916bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
917 const MachineOperand &UseMO) const {
918 // Operands of SDWA instructions must be registers.
919 return !TII->isSDWA(MI);
920}
921
922static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
923 const MachineRegisterInfo &MRI,
924 Register SrcReg) {
925 MachineOperand *Sub = nullptr;
926 for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
927 SubDef && TII.isFoldableCopy(*SubDef);
928 SubDef = MRI.getVRegDef(Sub->getReg())) {
929 MachineOperand &SrcOp = SubDef->getOperand(1);
930 if (SrcOp.isImm())
931 return &SrcOp;
932 if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
933 break;
934 Sub = &SrcOp;
935 // TODO: Support compose
936 if (SrcOp.getSubReg())
937 break;
938 }
939
940 return Sub;
941}
942
943const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
944 MachineInstr &RegSeq,
945 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
946
947 assert(RegSeq.isRegSequence());
948
949 const TargetRegisterClass *RC = nullptr;
950
951 for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
952 MachineOperand &SrcOp = RegSeq.getOperand(I);
953 unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
954
955 // Only accept reg_sequence with uniform reg class inputs for simplicity.
956 const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
957 if (!RC)
958 RC = OpRC;
959 else if (!TRI->getCommonSubClass(RC, OpRC))
960 return nullptr;
961
962 if (SrcOp.getSubReg()) {
963 // TODO: Handle subregister compose
964 Defs.emplace_back(&SrcOp, SubRegIdx);
965 continue;
966 }
967
968 MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
969 if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
970 Defs.emplace_back(DefSrc, SubRegIdx);
971 continue;
972 }
973
974 Defs.emplace_back(&SrcOp, SubRegIdx);
975 }
976
977 return RC;
978}
979
980// Find a def of the UseReg, check if it is a reg_sequence and find initializers
981// for each subreg, tracking it to an immediate if possible. Returns the
982// register class of the inputs on success.
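// Illustrative input (hypothetical registers):
//   %1:sgpr_32 = S_MOV_B32 0
//   %2:sgpr_64 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1
// yields Defs = {(imm 0, sub0), (imm 0, sub1)} and an SGPR_32 input class.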
983const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
984 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
985 Register UseReg) const {
986 MachineInstr *Def = MRI->getVRegDef(UseReg);
987 if (!Def || !Def->isRegSequence())
988 return nullptr;
989
990 return getRegSeqInit(*Def, Defs);
991}
992
993std::pair<int64_t, const TargetRegisterClass *>
994SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
995 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
996 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
997 if (!SrcRC)
998 return {};
999
1000 bool TryToMatchSplat64 = false;
1001
1002 int64_t Imm;
1003 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
1004 const MachineOperand *Op = Defs[I].first;
1005 if (!Op->isImm())
1006 return {};
1007
1008 int64_t SubImm = Op->getImm();
1009 if (!I) {
1010 Imm = SubImm;
1011 continue;
1012 }
1013
1014 if (Imm != SubImm) {
1015 if (I == 1 && (E & 1) == 0) {
1016 // If we have an even number of inputs, there's a chance this is a
1017 // 64-bit element splat broken into 32-bit pieces.
1018 TryToMatchSplat64 = true;
1019 break;
1020 }
1021
1022 return {}; // Can only fold splat constants
1023 }
1024 }
1025
1026 if (!TryToMatchSplat64)
1027 return {Defs[0].first->getImm(), SrcRC};
1028
1029 // Fallback to recognizing 64-bit splats broken into 32-bit pieces
1030 // (i.e. recognize that every other element is 0 for 64-bit immediates)
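  // Hedged example: inputs 1, 0, 1, 0 in sub0..sub3 merge into the 64-bit
  // value 1 in both halves, i.e. a 64-bit splat of 1.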
1031 int64_t SplatVal64;
1032 for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
1033 const MachineOperand *Op0 = Defs[I].first;
1034 const MachineOperand *Op1 = Defs[I + 1].first;
1035
1036 if (!Op0->isImm() || !Op1->isImm())
1037 return {};
1038
1039 unsigned SubReg0 = Defs[I].second;
1040 unsigned SubReg1 = Defs[I + 1].second;
1041
1042 // Assume we're going to generally encounter reg_sequences with sorted
1043 // subreg indexes, so reject any that aren't consecutive.
1044 if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
1045 TRI->getChannelFromSubReg(SubReg1))
1046 return {};
1047
1048 int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
1049 if (I == 0)
1050 SplatVal64 = MergedVal;
1051 else if (SplatVal64 != MergedVal)
1052 return {};
1053 }
1054
1055 const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
1056 MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);
1057
1058 return {SplatVal64, RC64};
1059}
1060
1061bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
1062 MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
1063 const TargetRegisterClass *SplatRC) const {
1064 const MCInstrDesc &Desc = UseMI->getDesc();
1065 if (UseOpIdx >= Desc.getNumOperands())
1066 return false;
1067
1068 // Filter out unhandled pseudos.
1069 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1070 return false;
1071
1072 int16_t RCID = Desc.operands()[UseOpIdx].RegClass;
1073 if (RCID == -1)
1074 return false;
1075
1076 const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
1077
1078 // Special case 0/-1, since when interpreted as a 64-bit element both halves
1079 // have the same bits. These are the only cases where a splat has the same
1080 // interpretation for 32-bit and 64-bit splats.
1081 if (SplatVal != 0 && SplatVal != -1) {
1082 // We need to figure out the scalar type read by the operand. e.g. the MFMA
1083 // operand will be AReg_128, and we want to check if it's compatible with an
1084 // AReg_32 constant.
1085 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
1086 switch (OpTy) {
1087 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
1088 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
1089 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
1090 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
1091 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
1092 break;
1093 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
1094 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
1095 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
1096 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
1097 break;
1098 default:
1099 return false;
1100 }
1101
1102 if (!TRI->getCommonSubClass(OpRC, SplatRC))
1103 return false;
1104 }
1105
1106 MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
1107 if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
1108 return false;
1109
1110 return true;
1111}
1112
1113bool SIFoldOperandsImpl::tryToFoldACImm(
1114 const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
1115 SmallVectorImpl<FoldCandidate> &FoldList) const {
1116 const MCInstrDesc &Desc = UseMI->getDesc();
1117 if (UseOpIdx >= Desc.getNumOperands())
1118 return false;
1119
1120 // Filter out unhandled pseudos.
1121 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1122 return false;
1123
1124 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
1125 if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
1126 appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
1127 return true;
1128 }
1129
1130 // TODO: Verify the following code handles subregisters correctly.
1131 // TODO: Handle extract of global reference
1132 if (UseOp.getSubReg())
1133 return false;
1134
1135 if (!OpToFold.isReg())
1136 return false;
1137
1138 Register UseReg = OpToFold.getReg();
1139 if (!UseReg.isVirtual())
1140 return false;
1141
1142 // Maybe it is just a COPY of an immediate itself.
1143
1144 // FIXME: Remove this handling. There is already special case folding of
1145 // immediate into copy in foldOperand. This is looking for the def of the
1146 // value the folding started from in the first place.
1147 MachineInstr *Def = MRI->getVRegDef(UseReg);
1148 if (Def && TII->isFoldableCopy(*Def)) {
1149 MachineOperand &DefOp = Def->getOperand(1);
1150 if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
1151 FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
1152 OpToFold.DefSubReg);
1153 appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
1154 return true;
1155 }
1156 }
1157
1158 return false;
1159}
1160
1161void SIFoldOperandsImpl::foldOperand(
1162 FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
1163 SmallVectorImpl<FoldCandidate> &FoldList,
1164 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
1165 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
1166
1167 if (!isUseSafeToFold(*UseMI, *UseOp))
1168 return;
1169
1170 // FIXME: Fold operands with subregs.
1171 if (UseOp->isReg() && OpToFold.isReg()) {
1172 if (UseOp->isImplicit())
1173 return;
1174 // Allow folding from SGPRs to 16-bit VGPRs.
1175 if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
1176 (UseOp->getSubReg() != AMDGPU::lo16 ||
1177 !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
1178 return;
1179 }
1180
1181 // Special case for REG_SEQUENCE: We can't fold literals into
1182 // REG_SEQUENCE instructions, so we have to fold them into the
1183 // uses of REG_SEQUENCE.
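  // Sketch (hypothetical MIR):
  //   %2 = S_MOV_B32 <literal>
  //   %3 = REG_SEQUENCE %2, %subreg.sub0, ...
  //   ... = use of %3.sub0       ; try folding the literal into this use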
1184 if (UseMI->isRegSequence()) {
1185 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
1186 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
1187
1188 int64_t SplatVal;
1189 const TargetRegisterClass *SplatRC;
1190 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
1191
1192 // Grab the use operands first
1193 SmallVector<MachineOperand *> UsesToProcess(
1194 llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
1195 for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
1196 MachineOperand *RSUse = UsesToProcess[I];
1197 MachineInstr *RSUseMI = RSUse->getParent();
1198 unsigned OpNo = RSUseMI->getOperandNo(RSUse);
1199
1200 if (SplatRC) {
1201 if (RSUseMI->isCopy()) {
1202 Register DstReg = RSUseMI->getOperand(0).getReg();
1203 append_range(UsesToProcess,
1204 make_pointer_range(MRI->use_nodbg_operands(DstReg)));
1205 continue;
1206 }
1207 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
1208 FoldableDef SplatDef(SplatVal, SplatRC);
1209 appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
1210 continue;
1211 }
1212 }
1213
1214 // TODO: Handle general compose
1215 if (RSUse->getSubReg() != RegSeqDstSubReg)
1216 continue;
1217
1218 // FIXME: We should avoid recursing here. There should be a cleaner split
1219 // between the in-place mutations and adding to the fold list.
1220 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
1221 CopiesToReplace);
1222 }
1223
1224 return;
1225 }
1226
1227 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
1228 return;
1229
1230 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
1231 // Verify that this is a stack access.
1232 // FIXME: Should probably use stack pseudos before frame lowering.
1233
1234 if (TII->isMUBUF(*UseMI)) {
1235 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
1236 MFI->getScratchRSrcReg())
1237 return;
1238
1239 // Ensure this is either relative to the current frame or the current
1240 // wave.
1241 MachineOperand &SOff =
1242 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
1243 if (!SOff.isImm() || SOff.getImm() != 0)
1244 return;
1245 }
1246
1247 const unsigned Opc = UseMI->getOpcode();
1248 if (TII->isFLATScratch(*UseMI) &&
1249 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
1250 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
1251 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
1252 unsigned CPol =
1253 TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
1254 if ((CPol & AMDGPU::CPol::SCAL) &&
1256 return;
1257
1258 UseMI->setDesc(TII->get(NewOpc));
1259 }
1260
1261 // A frame index will resolve to a positive constant, so it should always be
1262 // safe to fold the addressing mode, even pre-GFX9.
1263 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
1264
1265 return;
1266 }
1267
1268 bool FoldingImmLike =
1269 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1270
1271 if (FoldingImmLike && UseMI->isCopy()) {
1272 Register DestReg = UseMI->getOperand(0).getReg();
1273 Register SrcReg = UseMI->getOperand(1).getReg();
1274 unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
1275 assert(SrcReg.isVirtual());
1276
1277 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
1278
1279 // Don't fold into a copy to a physical register with the same class. Doing
1280 // so would interfere with the register coalescer's logic which would avoid
1281 // redundant initializations.
1282 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
1283 return;
1284
1285 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1286 // In order to fold immediates into copies, we need to change the copy to a
1287 // MOV. Find a compatible mov instruction with the value.
1288 for (unsigned MovOp :
1289 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1290 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
1291 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
1292 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
1293 const MCInstrDesc &MovDesc = TII->get(MovOp);
1294 assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);
1295
1296 const TargetRegisterClass *MovDstRC =
1297 TRI->getRegClass(MovDesc.operands()[0].RegClass);
1298
1299 // Fold if the destination register class of the MOV instruction (ResRC)
1300 // is a superclass of (or equal to) the destination register class of the
1301 // COPY (DestRC). If this condition fails, folding would be illegal.
1302 if (!DestRC->hasSuperClassEq(MovDstRC))
1303 continue;
1304
1305 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1306 const TargetRegisterClass *MovSrcRC =
1307 TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass);
1308 if (MovSrcRC) {
1309 if (UseSubReg)
1310 MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1311 if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
1312 break;
1313
1314 // FIXME: This is mutating the instruction only and deferring the actual
1315 // fold of the immediate
1316 } else {
1317 // For the _IMM_PSEUDO cases, there can be value restrictions on the
1318 // immediate to verify. Technically we should always verify this, but it
1319 // only matters for these concrete cases.
1320 // TODO: Handle non-imm case if it's useful.
1321 if (!OpToFold.isImm() ||
1322 !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
1323 break;
1324 }
1325
1326 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
1327 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
1328 while (ImpOpI != ImpOpE) {
1329 MachineInstr::mop_iterator Tmp = ImpOpI;
1330 ImpOpI++;
1331 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
1332 }
1333 UseMI->setDesc(MovDesc);
1334
1335 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
1336 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
1337 MachineOperand NewSrcOp(SrcOp);
1338 MachineFunction *MF = UseMI->getParent()->getParent();
1339 UseMI->removeOperand(1);
1340 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1341 UseMI->addOperand(NewSrcOp); // src0
1342 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1343 UseOpIdx = SrcIdx;
1344 UseOp = &UseMI->getOperand(UseOpIdx);
1345 }
1346 CopiesToReplace.push_back(UseMI);
1347 break;
1348 }
1349
1350 // We failed to replace the copy, so give up.
1351 if (UseMI->getOpcode() == AMDGPU::COPY)
1352 return;
1353
1354 } else {
1355 if (UseMI->isCopy() && OpToFold.isReg() &&
1356 UseMI->getOperand(0).getReg().isVirtual() &&
1357 !UseMI->getOperand(1).getSubReg() &&
1358 OpToFold.DefMI->implicit_operands().empty()) {
1359 LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
1360 << *UseMI);
1361 unsigned Size = TII->getOpSize(*UseMI, 1);
1362 Register UseReg = OpToFold.getReg();
1363 UseMI->getOperand(1).setReg(UseReg);
1364 unsigned SubRegIdx = OpToFold.getSubReg();
1365 // Hack to allow 32-bit SGPRs to be folded into True16 instructions
1366 // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1367 // VS_16RegClass
1368 //
1369 // Excerpt from AMDGPUGenRegisterInfo.inc
1370 // NoSubRegister, //0
1371 // hi16, // 1
1372 // lo16, // 2
1373 // sub0, // 3
1374 // ...
1375 // sub1, // 11
1376 // sub1_hi16, // 12
1377 // sub1_lo16, // 13
1378 static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1379 if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1380 TRI->isSGPRReg(*MRI, UseReg)) {
1381 // Produce the 32 bit subregister index to which the 16-bit subregister
1382 // is aligned.
1383 if (SubRegIdx > AMDGPU::sub1) {
1384 LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1385 M |= M.getLane(M.getHighestLane() - 1);
1386 SmallVector<unsigned, 4> Indexes;
1387 TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1388 Indexes);
1389 assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1390 SubRegIdx = Indexes[0];
1391 // 32-bit registers do not have a sub0 index
1392 } else if (TII->getOpSize(*UseMI, 1) == 4)
1393 SubRegIdx = 0;
1394 else
1395 SubRegIdx = AMDGPU::sub0;
1396 }
1397 UseMI->getOperand(1).setSubReg(SubRegIdx);
1398 UseMI->getOperand(1).setIsKill(false);
1399 CopiesToReplace.push_back(UseMI);
1400 OpToFold.OpToFold->setIsKill(false);
1401
1402 // Remove kill flags as kills may now be out of order with uses.
1403 MRI->clearKillFlags(UseReg);
1404 if (foldCopyToAGPRRegSequence(UseMI))
1405 return;
1406 }
1407
1408 unsigned UseOpc = UseMI->getOpcode();
1409 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1410 (UseOpc == AMDGPU::V_READLANE_B32 &&
1411 (int)UseOpIdx ==
1412 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1413 // %vgpr = V_MOV_B32 imm
1414 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1415 // =>
1416 // %sgpr = S_MOV_B32 imm
1417 if (FoldingImmLike) {
1418 if (execMayBeModifiedBeforeUse(*MRI,
1419 UseMI->getOperand(UseOpIdx).getReg(),
1420 *OpToFold.DefMI, *UseMI))
1421 return;
1422
1423 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1424
1425 if (OpToFold.isImm()) {
1426 UseMI->getOperand(1).ChangeToImmediate(
1427 *OpToFold.getEffectiveImmVal());
1428 } else if (OpToFold.isFI())
1429 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
1430 else {
1431 assert(OpToFold.isGlobal());
1432 UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
1433 OpToFold.OpToFold->getOffset(),
1434 OpToFold.OpToFold->getTargetFlags());
1435 }
1436 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1437 return;
1438 }
1439
1440 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1441 if (execMayBeModifiedBeforeUse(*MRI,
1442 UseMI->getOperand(UseOpIdx).getReg(),
1443 *OpToFold.DefMI, *UseMI))
1444 return;
1445
1446 // %vgpr = COPY %sgpr0
1447 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1448 // =>
1449 // %sgpr1 = COPY %sgpr0
1450 UseMI->setDesc(TII->get(AMDGPU::COPY));
1451 UseMI->getOperand(1).setReg(OpToFold.getReg());
1452 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1453 UseMI->getOperand(1).setIsKill(false);
1454 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1455 return;
1456 }
1457 }
1458
1459 const MCInstrDesc &UseDesc = UseMI->getDesc();
1460
1461 // Don't fold into target independent nodes. Target independent opcodes
1462 // don't have defined register classes.
1463 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1464 UseDesc.operands()[UseOpIdx].RegClass == -1)
1465 return;
1466 }
1467
1468 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1469 // to enable more folding opportunities. The shrink operands pass
1470 // already does this.
1471
1472 tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
1473}
1474
1475static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1476 uint32_t LHS, uint32_t RHS) {
1477 switch (Opcode) {
1478 case AMDGPU::V_AND_B32_e64:
1479 case AMDGPU::V_AND_B32_e32:
1480 case AMDGPU::S_AND_B32:
1481 Result = LHS & RHS;
1482 return true;
1483 case AMDGPU::V_OR_B32_e64:
1484 case AMDGPU::V_OR_B32_e32:
1485 case AMDGPU::S_OR_B32:
1486 Result = LHS | RHS;
1487 return true;
1488 case AMDGPU::V_XOR_B32_e64:
1489 case AMDGPU::V_XOR_B32_e32:
1490 case AMDGPU::S_XOR_B32:
1491 Result = LHS ^ RHS;
1492 return true;
1493 case AMDGPU::S_XNOR_B32:
1494 Result = ~(LHS ^ RHS);
1495 return true;
1496 case AMDGPU::S_NAND_B32:
1497 Result = ~(LHS & RHS);
1498 return true;
1499 case AMDGPU::S_NOR_B32:
1500 Result = ~(LHS | RHS);
1501 return true;
1502 case AMDGPU::S_ANDN2_B32:
1503 Result = LHS & ~RHS;
1504 return true;
1505 case AMDGPU::S_ORN2_B32:
1506 Result = LHS | ~RHS;
1507 return true;
1508 case AMDGPU::V_LSHL_B32_e64:
1509 case AMDGPU::V_LSHL_B32_e32:
1510 case AMDGPU::S_LSHL_B32:
1511 // The instruction ignores the high bits for out of bounds shifts.
1512 Result = LHS << (RHS & 31);
1513 return true;
1514 case AMDGPU::V_LSHLREV_B32_e64:
1515 case AMDGPU::V_LSHLREV_B32_e32:
1516 Result = RHS << (LHS & 31);
1517 return true;
1518 case AMDGPU::V_LSHR_B32_e64:
1519 case AMDGPU::V_LSHR_B32_e32:
1520 case AMDGPU::S_LSHR_B32:
1521 Result = LHS >> (RHS & 31);
1522 return true;
1523 case AMDGPU::V_LSHRREV_B32_e64:
1524 case AMDGPU::V_LSHRREV_B32_e32:
1525 Result = RHS >> (LHS & 31);
1526 return true;
1527 case AMDGPU::V_ASHR_I32_e64:
1528 case AMDGPU::V_ASHR_I32_e32:
1529 case AMDGPU::S_ASHR_I32:
1530 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1531 return true;
1532 case AMDGPU::V_ASHRREV_I32_e64:
1533 case AMDGPU::V_ASHRREV_I32_e32:
1534 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1535 return true;
1536 default:
1537 return false;
1538 }
1539}
1540
1541static unsigned getMovOpc(bool IsScalar) {
1542 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1543}
1544
1545static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1546 MI.setDesc(NewDesc);
1547
1548 // Remove any leftover implicit operands from mutating the instruction. e.g.
1549 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1550 // anymore.
1551 const MCInstrDesc &Desc = MI.getDesc();
1552 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1553 Desc.implicit_defs().size();
1554
1555 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1556 MI.removeOperand(I);
1557}
1558
1559std::optional<int64_t>
1560SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
1561 if (Op.isImm())
1562 return Op.getImm();
1563
1564 if (!Op.isReg() || !Op.getReg().isVirtual())
1565 return std::nullopt;
1566
1567 const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1568 if (Def && Def->isMoveImmediate()) {
1569 const MachineOperand &ImmSrc = Def->getOperand(1);
1570 if (ImmSrc.isImm())
1571 return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1572 }
1573
1574 return std::nullopt;
1575}
1576
1577// Try to simplify operations with a constant that may appear after instruction
1578// selection.
1579// TODO: See if a frame index with a fixed offset can fold.
1580bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1581 if (!MI->allImplicitDefsAreDead())
1582 return false;
1583
1584 unsigned Opc = MI->getOpcode();
1585
1586 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1587 if (Src0Idx == -1)
1588 return false;
1589
1590 MachineOperand *Src0 = &MI->getOperand(Src0Idx);
1591 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1592
1593 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1594 Opc == AMDGPU::S_NOT_B32) &&
1595 Src0Imm) {
1596 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
1597 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1598 return true;
1599 }
1600
1601 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1602 if (Src1Idx == -1)
1603 return false;
1604
1605 MachineOperand *Src1 = &MI->getOperand(Src1Idx);
1606 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1607
1608 if (!Src0Imm && !Src1Imm)
1609 return false;
1610
1611 // and k0, k1 -> v_mov_b32 (k0 & k1)
1612 // or k0, k1 -> v_mov_b32 (k0 | k1)
1613 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1614 if (Src0Imm && Src1Imm) {
1615 int32_t NewImm;
1616 if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
1617 return false;
1618
1619 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1620
1621 // Be careful to change the right operand, src0 may belong to a different
1622 // instruction.
1623 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1624 MI->removeOperand(Src1Idx);
1625 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1626 return true;
1627 }
1628
1629 if (!MI->isCommutable())
1630 return false;
1631
1632 if (Src0Imm && !Src1Imm) {
1633 std::swap(Src0, Src1);
1634 std::swap(Src0Idx, Src1Idx);
1635 std::swap(Src0Imm, Src1Imm);
1636 }
1637
1638 int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
1639 if (Opc == AMDGPU::V_OR_B32_e64 ||
1640 Opc == AMDGPU::V_OR_B32_e32 ||
1641 Opc == AMDGPU::S_OR_B32) {
1642 if (Src1Val == 0) {
1643 // y = or x, 0 => y = copy x
1644 MI->removeOperand(Src1Idx);
1645 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1646 } else if (Src1Val == -1) {
1647 // y = or x, -1 => y = v_mov_b32 -1
1648 MI->removeOperand(Src1Idx);
1649 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1650 } else
1651 return false;
1652
1653 return true;
1654 }
1655
1656 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1657 Opc == AMDGPU::S_AND_B32) {
1658 if (Src1Val == 0) {
1659 // y = and x, 0 => y = v_mov_b32 0
1660 MI->removeOperand(Src0Idx);
1661 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1662 } else if (Src1Val == -1) {
1663 // y = and x, -1 => y = copy x
1664 MI->removeOperand(Src1Idx);
1665 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1666 } else
1667 return false;
1668
1669 return true;
1670 }
1671
1672 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1673 Opc == AMDGPU::S_XOR_B32) {
1674 if (Src1Val == 0) {
1675 // y = xor x, 0 => y = copy x
1676 MI->removeOperand(Src1Idx);
1677 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1678 return true;
1679 }
1680 }
1681
1682 return false;
1683}
1684
1685// Try to fold an instruction into a simpler one
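// e.g. (illustrative) when both selected sources are the same value,
//   %2 = V_CNDMASK_B32_e64 0, %1, 0, %1, %cc
// does not depend on %cc and can be rewritten as a plain COPY of %1.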
1686bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1687 unsigned Opc = MI.getOpcode();
1688 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1689 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1690 return false;
1691
1692 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1693 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1694 if (!Src1->isIdenticalTo(*Src0)) {
1695 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1696 if (!Src1Imm)
1697 return false;
1698
1699 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1700 if (!Src0Imm || *Src0Imm != *Src1Imm)
1701 return false;
1702 }
1703
1704 int Src1ModIdx =
1705 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1706 int Src0ModIdx =
1707 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1708 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1709 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1710 return false;
1711
1712 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1713 auto &NewDesc =
1714 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1715 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1716 if (Src2Idx != -1)
1717 MI.removeOperand(Src2Idx);
1718 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1719 if (Src1ModIdx != -1)
1720 MI.removeOperand(Src1ModIdx);
1721 if (Src0ModIdx != -1)
1722 MI.removeOperand(Src0ModIdx);
1723 mutateCopyOp(MI, NewDesc);
1724 LLVM_DEBUG(dbgs() << MI);
1725 return true;
1726}
1727
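// Fold away a 16-bit zero-extension mask when the producer already clears the
// high half, e.g. (sketch):
//   %2 = V_AND_B32_e32 0xffff, %1   ; %1 defined by an op known to zero the
//                                   ; high 16 bits of its result
// lets uses of %2 be rewritten to use %1 directly.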
1728bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1729 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1730 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1731 return false;
1732
1733 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
1734 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1735 return false;
1736
1737 Register Src1 = MI.getOperand(2).getReg();
1738 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1739 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1740 return false;
1741
1742 Register Dst = MI.getOperand(0).getReg();
1743 MRI->replaceRegWith(Dst, Src1);
1744 if (!MI.getOperand(2).isKill())
1745 MRI->clearKillFlags(Src1);
1746 MI.eraseFromParent();
1747 return true;
1748}
1749
1750bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1751 const FoldableDef &OpToFold) const {
1752 // We need to mutate the operands of new mov instructions to add implicit
1753 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1754 // this.
1755 SmallVector<MachineInstr *, 4> CopiesToReplace;
1756 SmallVector<FoldCandidate, 4> FoldList;
1757 MachineOperand &Dst = MI.getOperand(0);
1758 bool Changed = false;
1759
1760 if (OpToFold.isImm()) {
1761 for (auto &UseMI :
1762 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1763 // Folding the immediate may reveal operations that can be constant
1764 // folded or replaced with a copy. This can happen for example after
1765 // frame indices are lowered to constants or from splitting 64-bit
1766 // constants.
1767 //
1768 // We may also encounter cases where one or both operands are
1769 // immediates materialized into a register, which would ordinarily not
1770 // be folded due to multiple uses or operand constraints.
1771 if (tryConstantFoldOp(&UseMI)) {
1772 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1773 Changed = true;
1774 }
1775 }
1776 }
1777
1778 SmallVector<MachineOperand *, 4> UsesToProcess(
1779 llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
1780 for (auto *U : UsesToProcess) {
1781 MachineInstr *UseMI = U->getParent();
1782
1783 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
1784 foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1785 CopiesToReplace);
1786 }
1787
1788 if (CopiesToReplace.empty() && FoldList.empty())
1789 return Changed;
1790
1791 MachineFunction *MF = MI.getParent()->getParent();
1792 // Make sure we add EXEC uses to any new v_mov instructions created.
1793 for (MachineInstr *Copy : CopiesToReplace)
1794 Copy->addImplicitDefUseOperands(*MF);
1795
1796 SetVector<MachineInstr *> ConstantFoldCandidates;
1797 for (FoldCandidate &Fold : FoldList) {
1798 assert(!Fold.isReg() || Fold.Def.OpToFold);
1799 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1800 Register Reg = Fold.getReg();
1801 const MachineInstr *DefMI = Fold.Def.DefMI;
1802 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1803 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1804 continue;
1805 }
1806 if (updateOperand(Fold)) {
1807 // Clear kill flags.
1808 if (Fold.isReg()) {
1809 assert(Fold.Def.OpToFold && Fold.isReg());
1810 // FIXME: Probably shouldn't bother trying to fold if not an
1811 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1812 // copies.
1813 MRI->clearKillFlags(Fold.getReg());
1814 }
1815 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1816 << static_cast<int>(Fold.UseOpNo) << " of "
1817 << *Fold.UseMI);
1818
1819 if (Fold.isImm())
1820 ConstantFoldCandidates.insert(Fold.UseMI);
1821
1822 } else if (Fold.Commuted) {
1823 // Restoring instruction's original operand order if fold has failed.
1824 TII->commuteInstruction(*Fold.UseMI, false);
1825 }
1826 }
1827
1828 for (MachineInstr *MI : ConstantFoldCandidates) {
1829 if (tryConstantFoldOp(MI)) {
1830 LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
1831 Changed = true;
1832 }
1833 }
1834 return true;
1835}
1836
1837/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
1838/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
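/// Illustrative sketch (schematic MIR; names are placeholders): an inline
/// immediate reaching the reg_sequence through a v_mov can be rematerialized
/// directly into an AGPR, e.g.
///   %c:vgpr_32 = V_MOV_B32_e32 0
///   %0:vreg_64 = REG_SEQUENCE %c, %subreg.sub0, %x:vgpr_32, %subreg.sub1
///   %1:areg_64 = COPY %0
/// may become
///   %t:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0
///   %1:areg_64 = REG_SEQUENCE %t, %subreg.sub0, %v:vgpr_32, %subreg.sub1
/// where %v is a copy of %x (or %x itself when no extra copy is needed).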
1839bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
1840 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
1841 // only accept VGPR or inline immediate. Recreate a reg_sequence with its
1842 // initializers right here, so we will rematerialize immediates and avoid
1843 // copies via different reg classes.
1844 const TargetRegisterClass *DefRC =
1845 MRI->getRegClass(CopyMI->getOperand(0).getReg());
1846 if (!TRI->isAGPRClass(DefRC))
1847 return false;
1848
1849 Register UseReg = CopyMI->getOperand(1).getReg();
1850 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
1851 if (!RegSeq || !RegSeq->isRegSequence())
1852 return false;
1853
1854 const DebugLoc &DL = CopyMI->getDebugLoc();
1855 MachineBasicBlock &MBB = *CopyMI->getParent();
1856
1857 MachineInstrBuilder B(*MBB.getParent(), CopyMI);
1858 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1859
1860 const TargetRegisterClass *UseRC =
1861 MRI->getRegClass(CopyMI->getOperand(1).getReg());
1862
1863 // Value, subreg index for new REG_SEQUENCE
1864 SmallVector<std::pair<MachineOperand *, unsigned>, 32> NewDefs;
1865
1866 unsigned NumRegSeqOperands = RegSeq->getNumOperands();
1867 unsigned NumFoldable = 0;
1868
1869 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
1870 MachineOperand &RegOp = RegSeq->getOperand(I);
1871 unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
1872
1873 if (RegOp.getSubReg()) {
1874 // TODO: Handle subregister compose
1875 NewDefs.emplace_back(&RegOp, SubRegIdx);
1876 continue;
1877 }
1878
1879 MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
1880 if (!Lookup)
1881 Lookup = &RegOp;
1882
1883 if (Lookup->isImm()) {
1884 // Check if this is an agpr_32 subregister.
1885 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
1886 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1887 if (DestSuperRC &&
1888 TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1889 ++NumFoldable;
1890 NewDefs.emplace_back(Lookup, SubRegIdx);
1891 continue;
1892 }
1893 }
1894
1895 const TargetRegisterClass *InputRC =
1896 Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
1897 : MRI->getRegClass(RegOp.getReg());
1898
1899 // TODO: Account for Lookup->getSubReg()
1900
1901 // If we can't find a matching super class, this is an SGPR->AGPR or
1902 // VGPR->AGPR subreg copy (or something constant-like we have to materialize
1903 // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
1904 // want to rewrite to copy to an intermediate VGPR class.
1905 const TargetRegisterClass *MatchRC =
1906 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1907 if (!MatchRC) {
1908 ++NumFoldable;
1909 NewDefs.emplace_back(&RegOp, SubRegIdx);
1910 continue;
1911 }
1912
1913 NewDefs.emplace_back(&RegOp, SubRegIdx);
1914 }
1915
1916 // Do not clone a reg_sequence and merely change the result register class.
1917 if (NumFoldable == 0)
1918 return false;
1919
1920 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1921 for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
1922 CopyMI->removeOperand(I);
1923
1924 for (auto [Def, DestSubIdx] : NewDefs) {
1925 if (!Def->isReg()) {
1926 // TODO: Should we use single write for each repeated value like in
1927 // register case?
1928 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1929 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1930 .add(*Def);
1931 B.addReg(Tmp);
1932 } else {
1933 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
1934 Def->setIsKill(false);
1935
1936 Register &VGPRCopy = VGPRCopies[Src];
1937 if (!VGPRCopy) {
1938 const TargetRegisterClass *VGPRUseSubRC =
1939 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1940
1941 // We cannot build a reg_sequence out of the same registers; they
1942 // must be copied. Better to do it here, before copyPhysReg() creates
1943 // several reads to do the AGPR->VGPR->AGPR copy.
1944
1945 // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
1946 // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
1947 // later, create a copy here and track if we already have such a copy.
1948 const TargetRegisterClass *SubRC =
1949 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
1950 if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
1951 // TODO: Try to reconstrain class
1952 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
1953 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
1954 B.addReg(VGPRCopy);
1955 } else {
1956 // If it is already a VGPR, do not copy the register.
1957 B.add(*Def);
1958 }
1959 } else {
1960 B.addReg(VGPRCopy);
1961 }
1962 }
1963
1964 B.addImm(DestSubIdx);
1965 }
1966
1967 LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
1968 return true;
1969}
1970
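// Note (added for clarity; the authoritative opcode list is
// SIInstrInfo::isFoldableCopy): a "foldable copy" is a COPY-like,
// single-source instruction such as COPY, V_MOV_B32 or S_MOV_B32, whose
// source may be propagated into its users.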
1971bool SIFoldOperandsImpl::tryFoldFoldableCopy(
1972 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1973 Register DstReg = MI.getOperand(0).getReg();
1974 // Specially track simple redefs of m0 to the same value in a block, so we
1975 // can erase the later ones.
1976 if (DstReg == AMDGPU::M0) {
1977 MachineOperand &NewM0Val = MI.getOperand(1);
1978 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1979 MI.eraseFromParent();
1980 return true;
1981 }
1982
1983 // We aren't tracking other physical registers
1984 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1985 ? nullptr
1986 : &NewM0Val;
1987 return false;
1988 }
1989
1990 MachineOperand *OpToFoldPtr;
1991 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
1992 // Folding when any src_modifiers are non-zero is unsupported
1993 if (TII->hasAnyModifiersSet(MI))
1994 return false;
1995 OpToFoldPtr = &MI.getOperand(2);
1996 } else
1997 OpToFoldPtr = &MI.getOperand(1);
1998 MachineOperand &OpToFold = *OpToFoldPtr;
1999 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
2000
2001 // FIXME: We could also be folding things like TargetIndexes.
2002 if (!FoldingImm && !OpToFold.isReg())
2003 return false;
2004
2005 // Fold virtual registers and constant physical registers.
2006 if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
2007 !TRI->isConstantPhysReg(OpToFold.getReg()))
2008 return false;
2009
2010 // Prevent folding operands backwards in the function. For example,
2011 // the COPY opcode must not be replaced by 1 in this example:
2012 //
2013 // %3 = COPY %vgpr0; VGPR_32:%3
2014 // ...
2015 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
2016 if (!DstReg.isVirtual())
2017 return false;
2018
2019 const TargetRegisterClass *DstRC =
2020 MRI->getRegClass(MI.getOperand(0).getReg());
2021
2022 // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
2023 // Can remove this code if proper 16-bit SGPRs are implemented
2024 // Example: Pre-peephole-opt
2025 // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
2026 // %32:sreg_32 = COPY %29:sgpr_lo16
2027 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2028 // Post-peephole-opt and DCE
2029 // %32:sreg_32 = COPY %16.lo16:sreg_32
2030 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2031 // After this transform
2032 // %32:sreg_32 = COPY %16:sreg_32
2033 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2034 // After the fold operands pass
2035 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
2036 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
2037 OpToFold.getSubReg()) {
2038 if (DstRC == &AMDGPU::SReg_32RegClass &&
2039 DstRC == MRI->getRegClass(OpToFold.getReg())) {
2040 assert(OpToFold.getSubReg() == AMDGPU::lo16);
2041 OpToFold.setSubReg(0);
2042 }
2043 }
2044
2045 // Fold copy to AGPR through reg_sequence
2046 // TODO: Handle with subregister extract
2047 if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
2048 if (foldCopyToAGPRRegSequence(&MI))
2049 return true;
2050 }
2051
2052 FoldableDef Def(OpToFold, DstRC);
2053 bool Changed = foldInstOperand(MI, Def);
2054
2055 // If we managed to fold all uses of this copy then we might as well
2056 // delete it now.
2057 // The only reason we need to follow chains of copies here is that
2058 // tryFoldRegSequence looks forward through copies before folding a
2059 // REG_SEQUENCE into its eventual users.
2060 auto *InstToErase = &MI;
2061 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2062 auto &SrcOp = InstToErase->getOperand(1);
2063 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
2064 InstToErase->eraseFromParent();
2065 Changed = true;
2066 InstToErase = nullptr;
2067 if (!SrcReg || SrcReg.isPhysical())
2068 break;
2069 InstToErase = MRI->getVRegDef(SrcReg);
2070 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
2071 break;
2072 }
2073
2074 if (InstToErase && InstToErase->isRegSequence() &&
2075 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2076 InstToErase->eraseFromParent();
2077 Changed = true;
2078 }
2079
2080 if (Changed)
2081 return true;
2082
2083 // Run this after foldInstOperand to avoid turning scalar additions into
2084 // vector additions when the scalar result could just be folded into
2085 // the user(s).
2086 return OpToFold.isReg() &&
2087 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
2088}
2089
2090// Clamp patterns are canonically selected to v_max_* instructions, so only
2091// handle them.
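// Illustrative sketch (schematic MIR): the recognized pattern is a max of a
// value with itself whose only purpose is the clamp bit, e.g.
//   %1:vgpr_32 = V_MAX_F32_e64 0, %0, 0, %0, 1 /*clamp*/, 0 /*omod*/
// for which isClamp returns the shared source operand %0.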
2092const MachineOperand *
2093SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
2094 unsigned Op = MI.getOpcode();
2095 switch (Op) {
2096 case AMDGPU::V_MAX_F32_e64:
2097 case AMDGPU::V_MAX_F16_e64:
2098 case AMDGPU::V_MAX_F16_t16_e64:
2099 case AMDGPU::V_MAX_F16_fake16_e64:
2100 case AMDGPU::V_MAX_F64_e64:
2101 case AMDGPU::V_MAX_NUM_F64_e64:
2102 case AMDGPU::V_PK_MAX_F16:
2103 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2104 case AMDGPU::V_PK_MAX_NUM_BF16: {
2105 if (MI.mayRaiseFPException())
2106 return nullptr;
2107
2108 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
2109 return nullptr;
2110
2111 // Make sure sources are identical.
2112 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2113 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2114 if (!Src0->isReg() || !Src1->isReg() ||
2115 Src0->getReg() != Src1->getReg() ||
2116 Src0->getSubReg() != Src1->getSubReg() ||
2117 Src0->getSubReg() != AMDGPU::NoSubRegister)
2118 return nullptr;
2119
2120 // Can't fold up if we have modifiers.
2121 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2122 return nullptr;
2123
2124 unsigned Src0Mods
2125 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
2126 unsigned Src1Mods
2127 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
2128
2129 // Having a 0 op_sel_hi would require swizzling the output in the source
2130 // instruction, which we can't do.
2131 unsigned UnsetMods =
2132 (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2133 ? SISrcMods::OP_SEL_1
2134 : 0u;
2135 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2136 return nullptr;
2137 return Src0;
2138 }
2139 default:
2140 return nullptr;
2141 }
2142}
2143
2144// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
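// Illustrative sketch (schematic MIR; names are placeholders): given a
// single-use def feeding such a v_max,
//   %1:vgpr_32 = V_FMA_F32_e64 0, %a, 0, %b, 0, %c, 0, 0
//   %2:vgpr_32 = V_MAX_F32_e64 0, %1, 0, %1, 1 /*clamp*/, 0
// the clamp bit is set on the defining V_FMA, uses of %2 are rewritten to use
// %1, and the v_max is erased.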
2145bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2146 const MachineOperand *ClampSrc = isClamp(MI);
2147 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2148 return false;
2149
2150 if (!ClampSrc->getReg().isVirtual())
2151 return false;
2152
2153 // Look through COPY. COPY only observed with True16.
2154 Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
2155 MachineInstr *Def =
2156 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2157
2158 // The type of clamp must be compatible.
2159 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
2160 return false;
2161
2162 if (Def->mayRaiseFPException())
2163 return false;
2164
2165 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2166 if (!DefClamp)
2167 return false;
2168
2169 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2170
2171 // Clamp is applied after omod, so it is OK if omod is set.
2172 DefClamp->setImm(1);
2173
2174 Register DefReg = Def->getOperand(0).getReg();
2175 Register MIDstReg = MI.getOperand(0).getReg();
2176 if (TRI->isSGPRReg(*MRI, DefReg)) {
2177 // Pseudo scalar instructions have an SGPR for dst and clamp is a v_max*
2178 // instruction with a VGPR dst.
2179 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
2180 MIDstReg)
2181 .addReg(DefReg);
2182 } else {
2183 MRI->replaceRegWith(MIDstReg, DefReg);
2184 }
2185 MI.eraseFromParent();
2186
2187 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2188 // instruction, so we might as well convert it to the more flexible VOP3-only
2189 // mad/fma form.
2190 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2191 Def->eraseFromParent();
2192
2193 return true;
2194}
2195
2196static int getOModValue(unsigned Opc, int64_t Val) {
2197 switch (Opc) {
2198 case AMDGPU::V_MUL_F64_e64:
2199 case AMDGPU::V_MUL_F64_pseudo_e64: {
2200 switch (Val) {
2201 case 0x3fe0000000000000: // 0.5
2202 return SIOutMods::DIV2;
2203 case 0x4000000000000000: // 2.0
2204 return SIOutMods::MUL2;
2205 case 0x4010000000000000: // 4.0
2206 return SIOutMods::MUL4;
2207 default:
2208 return SIOutMods::NONE;
2209 }
2210 }
2211 case AMDGPU::V_MUL_F32_e64: {
2212 switch (static_cast<uint32_t>(Val)) {
2213 case 0x3f000000: // 0.5
2214 return SIOutMods::DIV2;
2215 case 0x40000000: // 2.0
2216 return SIOutMods::MUL2;
2217 case 0x40800000: // 4.0
2218 return SIOutMods::MUL4;
2219 default:
2220 return SIOutMods::NONE;
2221 }
2222 }
2223 case AMDGPU::V_MUL_F16_e64:
2224 case AMDGPU::V_MUL_F16_t16_e64:
2225 case AMDGPU::V_MUL_F16_fake16_e64: {
2226 switch (static_cast<uint16_t>(Val)) {
2227 case 0x3800: // 0.5
2228 return SIOutMods::DIV2;
2229 case 0x4000: // 2.0
2230 return SIOutMods::MUL2;
2231 case 0x4400: // 4.0
2232 return SIOutMods::MUL4;
2233 default:
2234 return SIOutMods::NONE;
2235 }
2236 }
2237 default:
2238 llvm_unreachable("invalid mul opcode");
2239 }
2240}
2241
2242// FIXME: Does this really not support denormals with f16?
2243// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
2244// handled, so will anything other than that break?
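// Illustrative sketch (schematic MIR; immediates shown symbolically): isOMod
// reports a multiply or add that is expressible as an output modifier on its
// register source, e.g.
//   %1:vgpr_32 = V_MUL_F32_e64 0, 2.0, 0, %0, 0, 0  -->  (%0, SIOutMods::MUL2)
//   %1:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %0, 0, 0   -->  (%0, SIOutMods::MUL2)
// provided no source modifiers, clamp or omod are already set.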
2245std::pair<const MachineOperand *, int>
2246SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
2247 unsigned Op = MI.getOpcode();
2248 switch (Op) {
2249 case AMDGPU::V_MUL_F64_e64:
2250 case AMDGPU::V_MUL_F64_pseudo_e64:
2251 case AMDGPU::V_MUL_F32_e64:
2252 case AMDGPU::V_MUL_F16_t16_e64:
2253 case AMDGPU::V_MUL_F16_fake16_e64:
2254 case AMDGPU::V_MUL_F16_e64: {
2255 // If output denormals are enabled, omod is ignored.
2256 if ((Op == AMDGPU::V_MUL_F32_e64 &&
2257 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2258 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2259 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
2260 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2261 MFI->getMode().FP64FP16Denormals.Output !=
2262 DenormalMode::PreserveSign) ||
2263 MI.mayRaiseFPException())
2264 return std::pair(nullptr, SIOutMods::NONE);
2265
2266 const MachineOperand *RegOp = nullptr;
2267 const MachineOperand *ImmOp = nullptr;
2268 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2269 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2270 if (Src0->isImm()) {
2271 ImmOp = Src0;
2272 RegOp = Src1;
2273 } else if (Src1->isImm()) {
2274 ImmOp = Src1;
2275 RegOp = Src0;
2276 } else
2277 return std::pair(nullptr, SIOutMods::NONE);
2278
2279 int OMod = getOModValue(Op, ImmOp->getImm());
2280 if (OMod == SIOutMods::NONE ||
2281 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2282 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2283 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
2284 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
2285 return std::pair(nullptr, SIOutMods::NONE);
2286
2287 return std::pair(RegOp, OMod);
2288 }
2289 case AMDGPU::V_ADD_F64_e64:
2290 case AMDGPU::V_ADD_F64_pseudo_e64:
2291 case AMDGPU::V_ADD_F32_e64:
2292 case AMDGPU::V_ADD_F16_e64:
2293 case AMDGPU::V_ADD_F16_t16_e64:
2294 case AMDGPU::V_ADD_F16_fake16_e64: {
2295 // If output denormals are enabled, omod is ignored.
2296 if ((Op == AMDGPU::V_ADD_F32_e64 &&
2297 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2298 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2299 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
2300 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2301 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
2302 return std::pair(nullptr, SIOutMods::NONE);
2303
2304 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
2305 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2306 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2307
2308 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
2309 Src0->getSubReg() == Src1->getSubReg() &&
2310 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
2311 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
2312 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
2313 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2314 return std::pair(Src0, SIOutMods::MUL2);
2315
2316 return std::pair(nullptr, SIOutMods::NONE);
2317 }
2318 default:
2319 return std::pair(nullptr, SIOutMods::NONE);
2320 }
2321}
2322
2323// FIXME: Does this need to check IEEE bit on function?
2324bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2325 const MachineOperand *RegOp;
2326 int OMod;
2327 std::tie(RegOp, OMod) = isOMod(MI);
2328 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
2329 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2330 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2331 return false;
2332
2333 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2334 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2335 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
2336 return false;
2337
2338 if (Def->mayRaiseFPException())
2339 return false;
2340
2341 // Clamp is applied after omod. If the source already has clamp set, don't
2342 // fold it.
2343 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2344 return false;
2345
2346 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2347
2348 DefOMod->setImm(OMod);
2349 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2350 // Kill flags can be wrong if we replaced a def inside a loop with a def
2351 // outside the loop.
2352 MRI->clearKillFlags(Def->getOperand(0).getReg());
2353 MI.eraseFromParent();
2354
2355 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2356 // instruction, so we might as well convert it to the more flexible VOP3-only
2357 // mad/fma form.
2358 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2359 Def->eraseFromParent();
2360
2361 return true;
2362}
2363
2364// Try to fold a reg_sequence with vgpr output and agpr inputs into an
2365// instruction which can take an agpr. So far that means a store.
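// Illustrative sketch (schematic MIR; operand lists abbreviated):
//   %0:vreg_128 = REG_SEQUENCE %a0:agpr_32, %subreg.sub0, ..., %a3:agpr_32, %subreg.sub3
//   GLOBAL_STORE_DWORDX4 %vaddr, %0, ...
// is rewritten so the store consumes the AGPRs directly:
//   %1:areg_128 = REG_SEQUENCE %a0:agpr_32, %subreg.sub0, ..., %a3:agpr_32, %subreg.sub3
//   GLOBAL_STORE_DWORDX4 %vaddr, %1, ...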
2366bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
2367 assert(MI.isRegSequence());
2368 auto Reg = MI.getOperand(0).getReg();
2369
2370 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
2371 !MRI->hasOneNonDBGUse(Reg))
2372 return false;
2373
2374 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
2375 if (!getRegSeqInit(Defs, Reg))
2376 return false;
2377
2378 for (auto &[Op, SubIdx] : Defs) {
2379 if (!Op->isReg())
2380 return false;
2381 if (TRI->isAGPR(*MRI, Op->getReg()))
2382 continue;
2383 // Maybe this is a COPY from AREG
2384 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
2385 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
2386 return false;
2387 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
2388 return false;
2389 }
2390
2391 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
2392 MachineInstr *UseMI = Op->getParent();
2393 while (UseMI->isCopy() && !Op->getSubReg()) {
2394 Reg = UseMI->getOperand(0).getReg();
2395 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
2396 return false;
2397 Op = &*MRI->use_nodbg_begin(Reg);
2398 UseMI = Op->getParent();
2399 }
2400
2401 if (Op->getSubReg())
2402 return false;
2403
2404 unsigned OpIdx = Op - &UseMI->getOperand(0);
2405 const MCInstrDesc &InstDesc = UseMI->getDesc();
2406 const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI);
2407 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
2408 return false;
2409
2410 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
2411 auto Dst = MRI->createVirtualRegister(NewDstRC);
2412 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2413 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2414
2415 for (auto &[Def, SubIdx] : Defs) {
2416 Def->setIsKill(false);
2417 if (TRI->isAGPR(*MRI, Def->getReg())) {
2418 RS.add(*Def);
2419 } else { // This is a copy
2420 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
2421 SubDef->getOperand(1).setIsKill(false);
2422 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
2423 }
2424 RS.addImm(SubIdx);
2425 }
2426
2427 Op->setReg(Dst);
2428 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
2429 Op->setReg(Reg);
2430 RS->eraseFromParent();
2431 return false;
2432 }
2433
2434 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
2435
2436 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
2437 // in which case we can erase them all later in runOnMachineFunction.
2438 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
2439 MI.eraseFromParent();
2440 return true;
2441}
2442
2443/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success
2444/// and stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
2445static bool isAGPRCopy(const SIRegisterInfo &TRI,
2446 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
2447 Register &OutReg, unsigned &OutSubReg) {
2448 assert(Copy.isCopy());
2449
2450 const MachineOperand &CopySrc = Copy.getOperand(1);
2451 Register CopySrcReg = CopySrc.getReg();
2452 if (!CopySrcReg.isVirtual())
2453 return false;
2454
2455 // Common case: copy from AGPR directly, e.g.
2456 // %1:vgpr_32 = COPY %0:agpr_32
2457 if (TRI.isAGPR(MRI, CopySrcReg)) {
2458 OutReg = CopySrcReg;
2459 OutSubReg = CopySrc.getSubReg();
2460 return true;
2461 }
2462
2463 // Sometimes it can also involve two copies, e.g.
2464 // %1:vgpr_256 = COPY %0:agpr_256
2465 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2466 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2467 if (!CopySrcDef || !CopySrcDef->isCopy())
2468 return false;
2469
2470 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2471 Register OtherCopySrcReg = OtherCopySrc.getReg();
2472 if (!OtherCopySrcReg.isVirtual() ||
2473 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2474 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2475 !TRI.isAGPR(MRI, OtherCopySrcReg))
2476 return false;
2477
2478 OutReg = OtherCopySrcReg;
2479 OutSubReg = CopySrc.getSubReg();
2480 return true;
2481}
2482
2483// Try to hoist an AGPR to VGPR copy across a PHI.
2484// This should allow folding of an AGPR into a consumer which may support it.
2485//
2486// Example 1: LCSSA PHI
2487// loop:
2488// %1:vreg = COPY %0:areg
2489// exit:
2490// %2:vreg = PHI %1:vreg, %loop
2491// =>
2492// loop:
2493// exit:
2494// %1:areg = PHI %0:areg, %loop
2495// %2:vreg = COPY %1:areg
2496//
2497// Example 2: PHI with multiple incoming values:
2498// entry:
2499// %1:vreg = GLOBAL_LOAD(..)
2500// loop:
2501// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2502// %3:areg = COPY %2:vreg
2503// %4:areg = (instr using %3:areg)
2504// %5:vreg = COPY %4:areg
2505// =>
2506// entry:
2507// %1:vreg = GLOBAL_LOAD(..)
2508// %2:areg = COPY %1:vreg
2509// loop:
2510// %3:areg = PHI %2:areg, %entry, %X:areg,
2511// %4:areg = (instr using %3:areg)
2512bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2513 assert(PHI.isPHI());
2514
2515 Register PhiOut = PHI.getOperand(0).getReg();
2516 if (!TRI->isVGPR(*MRI, PhiOut))
2517 return false;
2518
2519 // Iterate once over all incoming values of the PHI to check if this PHI is
2520 // eligible, and determine the exact AGPR RC we'll target.
2521 const TargetRegisterClass *ARC = nullptr;
2522 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2523 MachineOperand &MO = PHI.getOperand(K);
2524 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2525 if (!Copy || !Copy->isCopy())
2526 continue;
2527
2528 Register AGPRSrc;
2529 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2530 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2531 continue;
2532
2533 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2534 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2535 CopyInRC = SubRC;
2536
2537 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2538 return false;
2539 ARC = CopyInRC;
2540 }
2541
2542 if (!ARC)
2543 return false;
2544
2545 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2546
2547 // Rewrite the PHI's incoming values to ARC.
2548 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2549 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2550 MachineOperand &MO = PHI.getOperand(K);
2551 Register Reg = MO.getReg();
2552
2553 MachineBasicBlock::iterator InsertPt;
2554 MachineBasicBlock *InsertMBB = nullptr;
2555
2556 // Look at the def of Reg, ignoring all copies.
2557 unsigned CopyOpc = AMDGPU::COPY;
2558 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2559
2560 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2561 // the copy was single-use, it will be removed by DCE later.
2562 if (Def->isCopy()) {
2563 Register AGPRSrc;
2564 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2565 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2566 MO.setReg(AGPRSrc);
2567 MO.setSubReg(AGPRSubReg);
2568 continue;
2569 }
2570
2571 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2572 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2573 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2574 // is unlikely to be profitable.
2575 //
2576 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2577 MachineOperand &CopyIn = Def->getOperand(1);
2578 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2579 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2580 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2581 }
2582
2583 InsertMBB = Def->getParent();
2584 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2585 } else {
2586 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2587 InsertPt = InsertMBB->getFirstTerminator();
2588 }
2589
2590 Register NewReg = MRI->createVirtualRegister(ARC);
2591 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2592 TII->get(CopyOpc), NewReg)
2593 .addReg(Reg);
2594 MO.setReg(NewReg);
2595
2596 (void)MI;
2597 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2598 }
2599
2600 // Replace the PHI's result with a new register.
2601 Register NewReg = MRI->createVirtualRegister(ARC);
2602 PHI.getOperand(0).setReg(NewReg);
2603
2604 // COPY that new register back to the original PhiOut register. This COPY will
2605 // usually be folded out later.
2606 MachineBasicBlock *MBB = PHI.getParent();
2607 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2608 TII->get(AMDGPU::COPY), PhiOut)
2609 .addReg(NewReg);
2610
2611 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2612 return true;
2613}
2614
2615// Attempt to convert a VGPR load into an AGPR load.
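// Illustrative sketch (schematic MIR; names are placeholders): when every
// transitive use of the loaded value ends up in an AGPR,
//   %0:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %1:agpr_32 = COPY %0
// the register classes are retyped so the load defines an AGPR directly,
// leaving only AGPR-to-AGPR copies behind.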
2616bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2617 assert(MI.mayLoad());
2618 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2619 return false;
2620
2621 MachineOperand &Def = MI.getOperand(0);
2622 if (!Def.isDef())
2623 return false;
2624
2625 Register DefReg = Def.getReg();
2626
2627 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2628 return false;
2629
2630 SmallVector<const MachineInstr *, 8> Users(
2631 llvm::make_pointer_range(MRI->use_nodbg_instructions(DefReg)));
2632 SmallVector<Register, 8> MoveRegs;
2633
2634 if (Users.empty())
2635 return false;
2636
2637 // Check that all uses are a copy to an agpr or a reg_sequence producing an agpr.
2638 while (!Users.empty()) {
2639 const MachineInstr *I = Users.pop_back_val();
2640 if (!I->isCopy() && !I->isRegSequence())
2641 return false;
2642 Register DstReg = I->getOperand(0).getReg();
2643 // Physical registers may have more than one defining instruction.
2644 if (DstReg.isPhysical())
2645 return false;
2646 if (TRI->isAGPR(*MRI, DstReg))
2647 continue;
2648 MoveRegs.push_back(DstReg);
2649 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2650 Users.push_back(&U);
2651 }
2652
2653 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2654 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2655 if (!TII->isOperandLegal(MI, 0, &Def)) {
2656 MRI->setRegClass(DefReg, RC);
2657 return false;
2658 }
2659
2660 while (!MoveRegs.empty()) {
2661 Register Reg = MoveRegs.pop_back_val();
2662 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2663 }
2664
2665 LLVM_DEBUG(dbgs() << "Folded " << MI);
2666
2667 return true;
2668}
2669
2670// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2671// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2672// there are cases where it can create a lot more AGPR-AGPR copies, which are
2673// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2674//
2675// This function looks at all AGPR PHIs in a basic block and collects their
2676// operands. Then, it checks for registers that are used more than once across
2677// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2678// having to create one VGPR temporary per use, which can get very messy if
2679// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2680// element).
2681//
2682// Example
2683// a:
2684// %in:agpr_256 = COPY %foo:vgpr_256
2685// c:
2686// %x:agpr_32 = ..
2687// b:
2688// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2689// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2690// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2691// =>
2692// a:
2693// %in:agpr_256 = COPY %foo:vgpr_256
2694// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2695// %tmp_agpr:agpr_32 = COPY %tmp
2696// c:
2697// %x:agpr_32 = ..
2698// b:
2699// %0:areg = PHI %tmp_agpr, %a, %x, %c
2700// %1:areg = PHI %tmp_agpr, %a, %y, %c
2701// %2:areg = PHI %tmp_agpr, %a, %z, %c
2702bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2703 // This is only really needed on GFX908 where AGPR-AGPR copies are
2704 // unreasonably difficult.
2705 if (ST->hasGFX90AInsts())
2706 return false;
2707
2708 // Look at all AGPR Phis and collect the register + subregister used.
2709 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2710 RegToMO;
2711
2712 for (auto &MI : MBB) {
2713 if (!MI.isPHI())
2714 break;
2715
2716 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2717 continue;
2718
2719 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2720 MachineOperand &PhiMO = MI.getOperand(K);
2721 if (!PhiMO.getSubReg())
2722 continue;
2723 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2724 }
2725 }
2726
2727 // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2728 // a VGPR.
2729 bool Changed = false;
2730 for (const auto &[Entry, MOs] : RegToMO) {
2731 if (MOs.size() == 1)
2732 continue;
2733
2734 const auto [Reg, SubReg] = Entry;
2735 MachineInstr *Def = MRI->getVRegDef(Reg);
2736 MachineBasicBlock *DefMBB = Def->getParent();
2737
2738 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2739 // out.
2740 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2741 Register TempVGPR =
2742 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2743 MachineInstr *VGPRCopy =
2744 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2745 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2746 .addReg(Reg, /* flags */ 0, SubReg);
2747
2748 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2749 Register TempAGPR = MRI->createVirtualRegister(ARC);
2750 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2751 TII->get(AMDGPU::COPY), TempAGPR)
2752 .addReg(TempVGPR);
2753
2754 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2755 for (MachineOperand *MO : MOs) {
2756 MO->setReg(TempAGPR);
2757 MO->setSubReg(AMDGPU::NoSubRegister);
2758 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2759 }
2760
2761 Changed = true;
2762 }
2763
2764 return Changed;
2765}
2766
2767bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2768 this->MF = &MF;
2769 MRI = &MF.getRegInfo();
2770 ST = &MF.getSubtarget<GCNSubtarget>();
2771 TII = ST->getInstrInfo();
2772 TRI = &TII->getRegisterInfo();
2773 MFI = MF.getInfo<SIMachineFunctionInfo>();
2774
2775 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2776 // correctly handle signed zeros.
2777 //
2778 // FIXME: Also need to check strictfp
2779 bool IsIEEEMode = MFI->getMode().IEEE;
2780 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2781
2782 bool Changed = false;
2783 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2784 MachineOperand *CurrentKnownM0Val = nullptr;
2785 for (auto &MI : make_early_inc_range(*MBB)) {
2786 Changed |= tryFoldCndMask(MI);
2787
2788 if (tryFoldZeroHighBits(MI)) {
2789 Changed = true;
2790 continue;
2791 }
2792
2793 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2794 Changed = true;
2795 continue;
2796 }
2797
2798 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2799 Changed = true;
2800 continue;
2801 }
2802
2803 if (MI.mayLoad() && tryFoldLoad(MI)) {
2804 Changed = true;
2805 continue;
2806 }
2807
2808 if (TII->isFoldableCopy(MI)) {
2809 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2810 continue;
2811 }
2812
2813 // Saw an unknown clobber of m0, so we no longer know what it is.
2814 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2815 CurrentKnownM0Val = nullptr;
2816
2817 // TODO: Omod might be OK if there is NSZ only on the source
2818 // instruction, and not the omod multiply.
2819 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2820 !tryFoldOMod(MI))
2821 Changed |= tryFoldClamp(MI);
2822 }
2823
2824 Changed |= tryOptimizeAGPRPhis(*MBB);
2825 }
2826
2827 return Changed;
2828}
2829
2830PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
2831 MachineFunctionAnalysisManager &MFAM) {
2832 MFPropsModifier _(*this, MF);
2833
2834 bool Changed = SIFoldOperandsImpl().run(MF);
2835 if (!Changed) {
2836 return PreservedAnalyses::all();
2837 }
2838 auto PA = getMachineFunctionPassPreservedAnalyses();
2839 PA.preserveSet<CFGAnalyses>();
2840 return PA;
2841}