#define DEBUG_TYPE "si-fold-operands"

  unsigned DefSubReg = AMDGPU::NoSubRegister;

  FoldableDef() = delete;

  FoldableDef(const MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
              unsigned DefSubReg = AMDGPU::NoSubRegister)
      : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
    if (FoldOp.isImm()) {
      ImmToFold = FoldOp.getImm();
    } else if (FoldOp.isFI()) {
      FrameIndexToFold = FoldOp.getIndex();
    }
  }

  FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
              unsigned DefSubReg = AMDGPU::NoSubRegister)
      : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
        Kind(MachineOperand::MO_Immediate) {}

  FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
    FoldableDef Copy(*this);
    Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
    return Copy;
  }

  Register getReg() const { return OpToFold->getReg(); }

  unsigned getSubReg() const { return OpToFold->getSubReg(); }

  int getFI() const { return FrameIndexToFold; }

  std::optional<int64_t> getEffectiveImmVal() const {
    return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
  }

  bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
                      unsigned OpIdx) const {
    switch (Kind) {
    case MachineOperand::MO_Immediate: {
      std::optional<int64_t> ImmToFold = getEffectiveImmVal();
      if (!ImmToFold)
        return false;
      MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
      return TII.isOperandLegal(MI, OpIdx, &TmpOp);
    }
    case MachineOperand::MO_FrameIndex: {
      if (DefSubReg != AMDGPU::NoSubRegister)
        return false;
      MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
      return TII.isOperandLegal(MI, OpIdx, &TmpOp);
    }
    default:
      if (DefSubReg != AMDGPU::NoSubRegister)
        return false;
      return TII.isOperandLegal(MI, OpIdx, OpToFold);
    }
  }

struct FoldCandidate {
  MachineInstr *UseMI;
  FoldableDef Def;
  int ShrinkOpcode;
  unsigned UseOpNo;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, const FoldableDef &Def,
                bool Commuted = false, int ShrinkOp = -1)
      : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
        Commuted(Commuted) {}

  bool isFI() const { return Def.isFI(); }

  int getFI() const { return Def.FrameIndexToFold; }

  bool isImm() const { return Def.isImm(); }

  bool isReg() const { return Def.isReg(); }

  bool isGlobal() const { return Def.isGlobal(); }

  bool needsShrink() const { return ShrinkOpcode != -1; }
};

class SIFoldOperandsImpl {
public:
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                         const FoldableDef &OpToFold) const;

  unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
    switch (Opc) {
    case AMDGPU::S_ADD_I32: {
      if (ST->hasAddNoCarry())
        return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
      return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
    }
    case AMDGPU::S_OR_B32:
      return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
    case AMDGPU::S_AND_B32:
      return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
    case AMDGPU::S_MUL_I32:
      return AMDGPU::V_MUL_LO_U32_e64;
    default:
      return AMDGPU::INSTRUCTION_LIST_END;
    }
  }

  bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
                                             MachineInstr &MI) const;

  bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
                          int64_t ImmVal) const;

  bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
                           int64_t ImmVal) const;

  bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                        MachineInstr *MI, unsigned OpNo,
                        const FoldableDef &OpToFold) const;

  const TargetRegisterClass *
  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                Register UseReg) const;

  std::pair<int64_t, const TargetRegisterClass *>
  isRegSeqSplat(MachineInstr &RegSeq) const;

  bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;

  bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;

  SIFoldOperandsImpl() = default;

  bool run(MachineFunction &MF);
};

class SIFoldOperandsLegacy : public MachineFunctionPass {
public:
  static char ID;

  bool runOnMachineFunction(MachineFunction &MF) override {
    if (skipFunction(MF.getFunction()))
      return false;
    return SIFoldOperandsImpl().run(MF);
  }

  StringRef getPassName() const override { return "SI Fold Operands"; }
};

char SIFoldOperandsLegacy::ID = 0;

static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
                                             const TargetRegisterInfo &TRI,
                                             const MachineOperand &MO) {
  const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
  if (const TargetRegisterClass *SubRC =
          TRI.getSubRegisterClass(RC, MO.getSubReg()))
    RC = SubRC;
  return RC;
}

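// Map a MAC/FMAC opcode to its corresponding three-address MAD/FMA form, so a
// literal can be folded into the multiply operand.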
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_t16_e64;
  case AMDGPU::V_FMAC_F16_fake16_e64:
    return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}

bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                                           const FoldableDef &OpToFold) const {
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32:
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e32:
    return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
           MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e64:
    return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
           MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
  default:
    break;
  }

  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}

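// If a COPY to a VGPR reads a scalar add of a frame index with a single use,
// rewrite the add as the equivalent VALU add and fold the copy away.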
bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
    Register DstReg, Register SrcReg, MachineInstr &MI) const {
  if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
      MRI->hasOneNonDBGUse(SrcReg)) {
    MachineInstr *Def = MRI->getVRegDef(SrcReg);
    if (!Def || Def->getNumOperands() != 4)
      return false;

    MachineOperand *Src0 = &Def->getOperand(1);
    MachineOperand *Src1 = &Def->getOperand(2);

    const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
    unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
    if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
        !Def->getOperand(3).isDead())
      return false;

    MachineBasicBlock *MBB = Def->getParent();
    const DebugLoc &DL = Def->getDebugLoc();
    if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
      MachineInstrBuilder Add =
          BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);

      if (Add->getDesc().getNumDefs() == 2) {
        Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
        Add.addDef(CarryOutReg, RegState::Dead);
        MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
      }

      Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
      Def->eraseFromParent();
      MI.eraseFromParent();
      return true;
    }

    assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);

    Def->eraseFromParent();
    MI.eraseFromParent();
    return true;
  }

  return false;
}

FunctionPass *llvm::createSIFoldOperandsLegacyPass() {
  return new SIFoldOperandsLegacy();
}

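// Check whether an immediate can be folded into a packed (op_sel) operand of
// this instruction.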
bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
                                            unsigned UseOpNo,
                                            int64_t ImmVal) const {
  const uint64_t TSFlags = MI->getDesc().TSFlags;

  const MachineOperand &Old = MI->getOperand(UseOpNo);
  int OpNo = MI->getOperandNo(&Old);

  unsigned Opcode = MI->getOpcode();
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
                                             int64_t ImmVal) const {
  MachineOperand &Old = MI->getOperand(UseOpNo);
  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

  AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
  unsigned SrcIdx = ~0;
  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
    ModName = AMDGPU::OpName::src0_modifiers;
    SrcIdx = 0;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
    ModName = AMDGPU::OpName::src1_modifiers;
    SrcIdx = 1;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
    ModName = AMDGPU::OpName::src2_modifiers;
    SrcIdx = 2;
  }
  assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
  int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
  MachineOperand &Mod = MI->getOperand(ModIdx);
  unsigned ModVal = Mod.getImm();

  uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;

  auto tryFoldToInline = [&](uint32_t Imm) -> bool {
    uint16_t Lo = static_cast<uint16_t>(Imm);
    uint16_t Hi = static_cast<uint16_t>(Imm >> 16);

    Mod.setImm(NewModVal);

    if (static_cast<int16_t>(Lo) < 0) {
      int32_t SExt = static_cast<int16_t>(Lo);
      Mod.setImm(NewModVal);
    }

    uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;

    return false;
  };

  if (tryFoldToInline(Imm))
    return true;

  bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
  bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
  if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
    int ClampIdx =
        AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
    bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;

    if (!Clamp) {
      uint16_t NegLo = -static_cast<uint16_t>(Imm);
      uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
      uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;

      if (tryFoldToInline(NegImm)) {
        unsigned NegOpcode =
            IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
        MI->setDesc(TII->get(NegOpcode));
        return true;
      }
    }
  }

  return false;
}

bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);

  std::optional<int64_t> ImmVal;
  if (Fold.isImm())
    ImmVal = Fold.Def.getEffectiveImmVal();

  if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
    if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
      return true;

    MachineOperand New = MachineOperand::CreateImm(*ImmVal);
    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;
    Old.ChangeToImmediate(*ImmVal);
    return true;
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();

    int Op32 = Fold.ShrinkOpcode;
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);

    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
              Dst1.getReg())
          .addReg(AMDGPU::VCC, RegState::Kill);
    }

    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

    if (Fold.Commuted)
      TII->commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
    if (NewMFMAOpc == -1)
      return false;
    MI->setDesc(TII->get(NewMFMAOpc));
    MI->untieRegOperand(0);

    MachineOperand New = MachineOperand::CreateImm(*ImmVal);
    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
                   Fold.Def.OpToFold->getOffset(),
                   Fold.Def.OpToFold->getTargetFlags());
    return true;
  }

  MachineOperand *New = Fold.Def.OpToFold;

  if (const TargetRegisterClass *OpRC =
          TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
    const TargetRegisterClass *NewRC =
        TRI->getRegClassForReg(*MRI, New->getReg());
    const TargetRegisterClass *ConstrainRC =
        TRI->findCommonRegClass(OpRC, Old.getSubReg(), NewRC, New->getSubReg());
    if (!ConstrainRC)
      return false;

    if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
      LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
                        << " to " << TRI->getRegClassName(ConstrainRC)
                        << '\n');
      return false;
    }
  }

  if (New->getReg().isPhysical()) {

static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                FoldCandidate &&Entry) {
  // Skip additional folding on the same operand.
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
      return;

  LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
                    << " operand " << Entry.UseOpNo << "\n  " << *Entry.UseMI);
  FoldList.push_back(Entry);
}

static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                const FoldableDef &FoldOp,
                                bool Commuted = false, int ShrinkOp = -1) {
  appendFoldCandidate(FoldList,
                      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
}

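// Try to record folding OpToFold into operand OpNo of MI, converting MAC to
// MAD, using the S_FMAAK/S_FMAMK forms, or commuting the instruction when that
// makes the fold legal.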
bool SIFoldOperandsImpl::tryAddToFoldList(
    SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
    const FoldableDef &OpToFold) const {
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold.isImm())
      return false;

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      MI->untieRegOperand(3);

      MachineOperand &Op1 = MI->getOperand(1);
      MachineOperand &Op2 = MI->getOperand(2);
    }
    return FoldAsFMAAKorMK;
  };

  bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
  if (!IsLegal && OpToFold.isImm()) {
    if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
      IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
  }

  if (!IsLegal) {
    unsigned NewOpc = macToMad(Opc);
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      MI->setDesc(TII->get(NewOpc));

      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->removeOperand(MI->getNumExplicitOperands() - 1);
    }

    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())
        return true;
    }

    if (OpToFold.isImm()) {
      unsigned ImmOpc = 0;
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      if (ImmOpc) {
        MI->setDesc(TII->get(ImmOpc));
        appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
        return true;
      }
    }

    unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
    if (!CanCommute)
      return false;

    MachineOperand &Op = MI->getOperand(OpNo);
    MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);

    if (!Op.isReg() || !CommutedOp.isReg())
      return false;

    if (Op.isReg() && CommutedOp.isReg() &&
        (Op.getReg() == CommutedOp.getReg() &&
         Op.getSubReg() == CommutedOp.getSubReg()))
      return false;

    if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
      return false;

    if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
      if ((Opc != AMDGPU::V_ADD_CO_U32_e64 &&
           Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) ||
          (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
        return false;
      }

      MachineOperand &OtherOp = MI->getOperand(OpNo);
      if (!OtherOp.isReg() ||
          !TRI->isVGPR(*MRI, OtherOp.getReg()))
        return false;

      unsigned MaybeCommutedOpc = MI->getOpcode();
      int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);

      appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold,
                          /*Commuted=*/true, Op32);
      return true;
    }

    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold,
                        /*Commuted=*/true);
    return true;
  }

  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())
      return true;
  }

  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
}

bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
                                         const MachineOperand &UseMO) const {
  // Operands in SDWA instructions cannot be folded.
  return !TII->isSDWA(MI);
}

static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
                                       const MachineRegisterInfo &MRI,
                                       Register SrcReg) {
  MachineOperand *Sub = nullptr;
  for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
       SubDef && TII.isFoldableCopy(*SubDef);
       SubDef = MRI.getVRegDef(Sub->getReg())) {
    MachineOperand &SrcOp = SubDef->getOperand(1);
    if (SrcOp.isImm())
      return &SrcOp;
    if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
      break;
    if (SrcOp.getSubReg())
      break;
    Sub = &SrcOp;
  }

  return Sub;
}

const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
    MachineInstr &RegSeq,
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
  assert(RegSeq.isRegSequence());

  const TargetRegisterClass *RC = nullptr;

  for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
    MachineOperand &SrcOp = RegSeq.getOperand(I);
    unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();

    // Only accept reg_sequence with uniform reg class inputs for simplicity.
    const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
    if (!RC)
      RC = OpRC;
    else if (!TRI->getCommonSubClass(RC, OpRC))
      return nullptr;

    if (SrcOp.getSubReg()) {
      Defs.emplace_back(&SrcOp, SubRegIdx);
      continue;
    }

    MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
    if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
      Defs.emplace_back(DefSrc, SubRegIdx);
      continue;
    }

    Defs.emplace_back(&SrcOp, SubRegIdx);
  }

  return RC;
}

const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg) const {
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return nullptr;

  return getRegSeqInit(*Def, Defs);
}

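// If every input of the REG_SEQUENCE is the same immediate (viewed as 32-bit
// or as 64-bit elements), return that splat value and its register class.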
std::pair<int64_t, const TargetRegisterClass *>
SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
  if (!SrcRC)
    return {};

  bool TryToMatchSplat64 = false;

  int64_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return {};

    int64_t SubImm = Op->getImm();
    if (!I) {
      Imm = SubImm;
      continue;
    }

    if (Imm != SubImm) {
      if (I == 1 && (E & 1) == 0) {
        // This may be a 64-bit element splat broken into 32-bit pieces.
        TryToMatchSplat64 = true;
        break;
      }

      return {};
    }
  }

  if (!TryToMatchSplat64)
    return {Defs[0].first->getImm(), SrcRC};

  int64_t SplatVal64;
  for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
    const MachineOperand *Op0 = Defs[I].first;
    const MachineOperand *Op1 = Defs[I + 1].first;

    if (!Op0->isImm() || !Op1->isImm())
      return {};

    unsigned SubReg0 = Defs[I].second;
    unsigned SubReg1 = Defs[I + 1].second;

    // Only handle consecutive subregister indexes.
    if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
        TRI->getChannelFromSubReg(SubReg1))
      return {};

    int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
    if (I == 0)
      SplatVal64 = MergedVal;
    else if (SplatVal64 != MergedVal)
      return {};
  }

  const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
      MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);

  return {SplatVal64, RC64};
}

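// Check whether a splat value taken from a REG_SEQUENCE is legal for the given
// use operand.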
bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
    MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
    const TargetRegisterClass *SplatRC) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  int16_t RCID = Desc.operands()[UseOpIdx].RegClass;
  if (RCID == -1)
    return false;

  const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);

  // 0 and -1 read the same as 32-bit or 64-bit elements; for anything else the
  // scalar type of the operand has to match the splat element width.
  if (SplatVal != 0 && SplatVal != -1) {
    uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
    switch (OpTy) {
    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
    case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
    case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
      break;
    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
      break;
    default:
      return false;
    }
  }

  if (!TRI->getCommonSubClass(OpRC, SplatRC))
    return false;

  MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
  if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
    return false;

  return true;
}

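// Try to fold an immediate (or an immediate materialized by a foldable copy)
// into an operand that accepts inline constants.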
bool SIFoldOperandsImpl::tryToFoldACImm(
    const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
    appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
    return true;
  }

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!UseReg.isVirtual())
    return false;

  // Maybe this is just a copy of an immediate itself.
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  if (Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
                              OpToFold.DefSubReg);
      appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
      return true;
    }
  }

  return false;
}

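// Core folding routine: try to fold OpToFold into the given use operand,
// recursing through REG_SEQUENCE uses, handling frame indexes, copies that can
// be rewritten as moves, and the V_READFIRSTLANE/V_READLANE special cases.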
void SIFoldOperandsImpl::foldOperand(
    FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList,
    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, *UseOp))
    return;

  if (UseOp->isReg() && OpToFold.isReg()) {
    if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
        !TRI->isSGPRReg(*MRI, OpToFold.getReg()))
      return;
  }

  if (UseMI->isRegSequence()) {
    int64_t SplatVal;
    const TargetRegisterClass *SplatRC;
    std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);

    for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
      MachineOperand *RSUse = UsesToProcess[I];
      MachineInstr *RSUseMI = RSUse->getParent();

      if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
        FoldableDef SplatDef(SplatVal, SplatRC);
        appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
        continue;
      }

      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    // Verify this is a stack access.
    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
        MFI->getScratchRSrcReg())
      return;

    MachineOperand &SOff =
        *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);

    unsigned CPol =
        TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);

    for (unsigned MovOp :
         {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
          AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
          AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
          AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
      const MCInstrDesc &MovDesc = TII->get(MovOp);

      const TargetRegisterClass *MovDstRC =
          TRI->getRegClass(MovDesc.operands()[0].RegClass);

      const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
      const TargetRegisterClass *MovSrcRC =
          TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass);

      MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
      if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
        break;

      if (!OpToFold.isImm() ||
          !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
        break;
    }

    while (ImpOpI != ImpOpE) {
      MachineInstr::mop_iterator Tmp = ImpOpI;
      ImpOpI++;
      UseMI->removeOperand(UseMI->getOperandNo(Tmp));
    }

    if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
      MachineOperand NewSrcOp(SrcOp);
    }
  }

  LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
                    << *UseMI);

  unsigned SubRegIdx = OpToFold.getSubReg();

  static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");

  if (SubRegIdx > AMDGPU::sub1) {
    LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
    M |= M.getLane(M.getHighestLane() - 1);
    SmallVector<unsigned, 4> Indexes;
    TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
                                  Indexes);
    assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
    SubRegIdx = Indexes[0];
  } else if (TII->getOpSize(*UseMI, 1) == 4)
    SubRegIdx = AMDGPU::sub0;

  OpToFold.OpToFold->setIsKill(false);

  if (foldCopyToAGPRRegSequence(UseMI))
    return;

  unsigned UseOpc = UseMI->getOpcode();
  if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
      (UseOpc == AMDGPU::V_READLANE_B32 &&
       (int)UseOpIdx ==
           AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
    if (FoldingImmLike) {
      if (execMayBeModifiedBeforeUse(*MRI, UseMI->getOperand(UseOpIdx).getReg(),
                                     *OpToFold.DefMI, *UseMI))
        return;

      if (OpToFold.isImm()) {
        UseMI->getOperand(UseOpIdx).ChangeToImmediate(
            *OpToFold.getEffectiveImmVal());
      } else if (OpToFold.isFI())
        UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
      else {
        assert(OpToFold.isGlobal());
        UseMI->getOperand(UseOpIdx).ChangeToGA(
            OpToFold.OpToFold->getGlobal(), OpToFold.OpToFold->getOffset(),
            OpToFold.OpToFold->getTargetFlags());
      }
      return;
    }

    if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
      if (execMayBeModifiedBeforeUse(*MRI, UseMI->getOperand(UseOpIdx).getReg(),
                                     *OpToFold.DefMI, *UseMI))
        return;
    }
  }

  const MCInstrDesc &UseDesc = UseMI->getDesc();
  if (UseDesc.isVariadic() || UseOp->isImplicit() ||
      UseDesc.operands()[UseOpIdx].RegClass == -1)
    return;

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
}

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The hardware ignores the high bits of out-of-bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);

  // Remove any leftover implicit operands from mutating the instruction.
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
}

std::optional<int64_t>
SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg() || !Op.getReg().isVirtual())
    return std::nullopt;

  const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    const MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
  }

  return std::nullopt;
}

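// Constant-fold an instruction whose sources are now known immediates
// (not/and/or/xor and shifts), mutating it into a move or a copy.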
bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())
    return false;

  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
  std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0Imm) {
    MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
  std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);

  if (!Src0Imm && !Src1Imm)
    return false;

  if (Src0Imm && Src1Imm) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
      return false;

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0Imm && !Src1Imm) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
    std::swap(Src0Imm, Src1Imm);
  }

  int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->removeOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

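// Fold V_CNDMASK with identical, unmodified sources into a copy or move of
// that source.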
bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

  std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
  if (!Src1Imm)
    return false;

  std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
  if (!Src0Imm || *Src0Imm != *Src1Imm)
    return false;

  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (Src2Idx != -1)
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
  return true;
}

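// Fold "and x, 0xffff" away when the producer of x already zeroes the high
// 16 bits of its result.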
bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
  if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
    return false;

  Register Src1 = MI.getOperand(2).getReg();
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  if (!SrcDef || !ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
    return false;

  Register Dst = MI.getOperand(0).getReg();
  MRI->replaceRegWith(Dst, Src1);
  if (!MI.getOperand(2).isKill())
    MRI->clearKillFlags(Src1);
  MI.eraseFromParent();
  return true;
}

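// Fold the definition OpToFold into all uses of MI's result, then apply the
// collected fold candidates and constant-fold any users that became trivial.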
bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
                                         const FoldableDef &OpToFold) const {
  SmallVector<FoldCandidate, 4> FoldList;
  SmallVector<MachineInstr *, 4> CopiesToReplace;

  MachineOperand &Dst = MI.getOperand(0);
  bool Changed = false;

  if (OpToFold.isImm()) {
    for (auto &UseMI :
         make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy.
      if (tryConstantFoldOp(&UseMI)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
        Changed = true;
      }
    }
  }

  SmallVector<MachineOperand *, 4> UsesToProcess(
      llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
  for (auto *U : UsesToProcess) {
    MachineInstr *UseMI = U->getParent();

    FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
    foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
                CopiesToReplace);
  }

  if (CopiesToReplace.empty() && FoldList.empty())
    return Changed;

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  SetVector<MachineInstr *> ConstantFoldCandidates;
  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.Def.OpToFold);
    if (Fold.isReg() && Fold.getReg().isVirtual()) {
      const MachineInstr *DefMI = Fold.Def.DefMI;
    }

    if (updateOperand(Fold)) {
      if (Fold.isReg()) {
        assert(Fold.Def.OpToFold && Fold.isReg());
        MRI->clearKillFlags(Fold.getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI);

      if (Fold.isImm())
        ConstantFoldCandidates.insert(Fold.UseMI);
    } else if (Fold.Commuted) {
      // Restore the instruction's original operand order if the fold failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }

  for (MachineInstr *MI : ConstantFoldCandidates) {
    if (tryConstantFoldOp(MI)) {
      LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
      Changed = true;
    }
  }
  return Changed;
}

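// Rewrite a copy of a REG_SEQUENCE into an AGPR class as a new REG_SEQUENCE,
// materializing immediates with v_accvgpr_write and reusing VGPR copies.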
bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
  const TargetRegisterClass *DefRC =
      MRI->getRegClass(CopyMI->getOperand(0).getReg());
  if (!TRI->isAGPRClass(DefRC))
    return false;

  Register UseReg = CopyMI->getOperand(1).getReg();
  MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
  if (!RegSeq || !RegSeq->isRegSequence())
    return false;

  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;

  const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);

  SmallVector<std::pair<MachineOperand *, unsigned>, 32> NewDefs;
  unsigned NumRegSeqOperands = RegSeq->getNumOperands();
  unsigned NumFoldable = 0;

  for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
    unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();

    const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
        DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);

    const TargetRegisterClass *InputRC =
        getRegOpRC(*MRI, *TRI, RegSeq->getOperand(I));

    const TargetRegisterClass *MatchRC =
        TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
  }

  if (NumFoldable == 0)
    return false;

  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));

  for (auto [Def, DestSubIdx] : NewDefs) {
    if (!Def->isReg()) {
      // Materialize the immediate into an AGPR with v_accvgpr_write_b32.
      Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
          .add(*Def);
      B.addReg(Tmp);
    } else {
      Def->setIsKill(false);

      TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
      Register &VGPRCopy = VGPRCopies[Src];
      if (!VGPRCopy) {
        const TargetRegisterClass *VGPRUseSubRC =
            TRI->getSubRegisterClass(UseRC, DestSubIdx);

        const TargetRegisterClass *SubRC =
            TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);

        VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
      }
    }

    B.addImm(DestSubIdx);
  }

  return true;
}

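// Handle a foldable copy/move: track redundant writes to m0, pick the operand
// to fold, and try to propagate it into the uses of the copy's destination.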
bool SIFoldOperandsImpl::tryFoldFoldableCopy(
    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
  Register DstReg = MI.getOperand(0).getReg();
  // Specially track simple redefs of m0 within a block, so the later ones can
  // be erased.
  if (DstReg == AMDGPU::M0) {
    MachineOperand &NewM0Val = MI.getOperand(1);
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      MI.eraseFromParent();
      return true;
    }
    return false;
  }

  MachineOperand *OpToFoldPtr;
  if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
    // Folding when any src_modifiers are non-zero is unsupported.
    if (TII->hasAnyModifiersSet(MI))
      return false;
    OpToFoldPtr = &MI.getOperand(2);
  } else
    OpToFoldPtr = &MI.getOperand(1);
  MachineOperand &OpToFold = *OpToFoldPtr;
  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (!FoldingImm && !OpToFold.isReg())
    return false;

  if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
      !TRI->isConstantPhysReg(OpToFold.getReg()))
    return false;

  const TargetRegisterClass *DstRC =
      MRI->getRegClass(MI.getOperand(0).getReg());

  if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
      OpToFold.getSubReg()) {
    if (DstRC == &AMDGPU::SReg_32RegClass &&
        DstRC == MRI->getRegClass(OpToFold.getReg())) {
      return false;
    }
  }

  if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
    if (foldCopyToAGPRRegSequence(&MI))
      return true;
  }

  FoldableDef Def(OpToFold, DstRC);
  bool Changed = foldInstOperand(MI, Def);

  // If all uses of this copy were folded, follow any chain of foldable copies
  // and erase the now-dead instructions.
  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    auto &SrcOp = InstToErase->getOperand(1);
    auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
    InstToErase->eraseFromParent();
    Changed = true;
    InstToErase = nullptr;
    if (!SrcReg || SrcReg.isPhysical())
      break;
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
      break;
  }

  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();
    Changed = true;
  }

  if (Changed)
    return true;

  return OpToFold.isReg() &&
         foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
}

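// Recognize a clamping max instruction (v_max x, x with the clamp bit set) and
// return the clamped source, so the clamp can be folded into the defining
// instruction.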
const MachineOperand *
SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::V_PK_MAX_F16:
  case AMDGPU::V_MAX_BF16_PSEUDO_e64:
  case AMDGPU::V_PK_MAX_NUM_BF16: {
    if (MI.mayRaiseFPException())
      return nullptr;

    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure the sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    unsigned UnsetMods =
        (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
            ? SISrcMods::OP_SEL_1
            : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The clamp is folded into the instruction computing the source, so both
  // must use the same clamp semantics.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  if (Def->mayRaiseFPException())
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

  DefClamp->setImm(1);

  Register DefReg = Def->getOperand(0).getReg();
  Register MIDstReg = MI.getOperand(0).getReg();
  if (TRI->isSGPRReg(*MRI, DefReg)) {
    // Pseudo scalar instructions have an SGPR dst, while the clamp is a v_max*
    // with a VGPR dst, so a copy is needed instead of replaceRegWith.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            MIDstReg)
        .addReg(DefReg);
  } else {
    MRI->replaceRegWith(MIDstReg, DefReg);
  }
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so convert it to the more flexible mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}

static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64: {
    switch (Val) {
    case 0x3fe0000000000000: // 0.5
      return SIOutMods::DIV2;
    case 0x4000000000000000: // 2.0
      return SIOutMods::MUL2;
    case 0x4010000000000000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    // Same mapping for the 32-bit encodings of 0.5, 2.0 and 4.0.
    }
  }
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    // Same mapping for the 16-bit encodings.
    }
  }
  }
}

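// Recognize a multiply or add by 0.5, 2.0 or 4.0 that can be expressed as an
// output modifier (omod) on the defining instruction.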
std::pair<const MachineOperand *, int>
SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
          Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output !=
             DenormalMode::PreserveSign) ||
        MI.mayRaiseFPException())
      return std::pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::pair(nullptr, SIOutMods::NONE);

    return std::pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64:
  case AMDGPU::V_ADD_F16_fake16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
          Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
      return std::pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::pair(Src0, SIOutMods::MUL2);

    return std::pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::pair(nullptr, SIOutMods::NONE);
  }
}

bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  if (Def->mayRaiseFPException())
    return false;

  // Clamp is applied after omod; if the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  // Kill flags can be wrong if we replaced a def inside a loop with a def
  // outside the loop.
  MRI->clearKillFlags(Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so convert it to the more flexible mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}

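// Fold a VGPR REG_SEQUENCE whose inputs are AGPRs (or copies from AGPRs) into
// an AGPR REG_SEQUENCE when its single user accepts AV operands.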
bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence());
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, Reg))
    return false;

  for (auto &[Op, SubIdx] : Defs) {
    if (!Op->isReg())
      return false;
    if (TRI->isAGPR(*MRI, Op->getReg()))
      continue;
    // Maybe this is a COPY from an AGPR.
    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
      return false;
  }

  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();

  if (Op->getSubReg())
    return false;

  const MCInstrDesc &InstDesc = UseMI->getDesc();
  const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI);
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))
    return false;

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (auto &[Def, SubIdx] : Defs) {
    Def->setIsKill(false);

    MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
  }

  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    RS->eraseFromParent();
    return false;
  }

  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();
  return true;
}

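// Check whether Copy reads an AGPR (possibly through one intermediate full
// copy) and report the AGPR register and subregister it reads.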
static bool isAGPRCopy(const SIRegisterInfo &TRI,
                       const MachineRegisterInfo &MRI, const MachineInstr &Copy,
                       Register &OutReg, unsigned &OutSubReg) {
  const MachineOperand &CopySrc = Copy.getOperand(1);
  Register CopySrcReg = CopySrc.getReg();

  if (TRI.isAGPR(MRI, CopySrcReg)) {
    OutReg = CopySrcReg;
    OutSubReg = CopySrc.getSubReg();
    return true;
  }

  const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
  if (!CopySrcDef || !CopySrcDef->isCopy())
    return false;

  const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
  Register OtherCopySrcReg = OtherCopySrc.getReg();
  if (!OtherCopySrcReg.isVirtual() ||
      CopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      !TRI.isAGPR(MRI, OtherCopySrcReg))
    return false;

  OutReg = OtherCopySrcReg;
  OutSubReg = CopySrc.getSubReg();
  return true;
}

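// Turn a VGPR PHI whose incoming values are copies from AGPRs into an AGPR
// PHI, inserting the necessary copies on the incoming edges.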
bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
  Register PhiOut = PHI.getOperand(0).getReg();
  if (!TRI->isVGPR(*MRI, PhiOut))
    return false;

  // First pass: determine the AGPR register class to use.
  const TargetRegisterClass *ARC = nullptr;
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
    if (!Copy || !Copy->isCopy())
      continue;

    Register AGPRSrc;
    unsigned AGPRRegMask = AMDGPU::NoSubRegister;
    if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
      continue;

    const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
      CopyInRC = SubRC;

    ARC = CopyInRC;
  }

  if (!ARC)
    return false;

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  // Second pass: rewrite the incoming values to ARC.
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    Register Reg = MO.getReg();

    MachineBasicBlock *InsertMBB = nullptr;
    MachineBasicBlock::iterator InsertPt;

    unsigned CopyOpc = AMDGPU::COPY;
    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
      if (Def->isCopy()) {
        Register AGPRSrc;
        unsigned AGPRSubReg = AMDGPU::NoSubRegister;
        if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
          MO.setReg(AGPRSrc);
          MO.setSubReg(AGPRSubReg);
          continue;
        }
      }

      MachineOperand &CopyIn = Def->getOperand(1);
      if (IsAGPR32 && !CopyIn.getSubReg())
        CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;

      InsertMBB = Def->getParent();
      InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
    }

    Register NewReg = MRI->createVirtualRegister(ARC);
    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
                               TII->get(CopyOpc), NewReg)
                           .addReg(Reg);
    MO.setReg(NewReg);
  }

  // Replace the PHI result and copy it back into the original register.
  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);

  MachineBasicBlock *MBB = PHI.getParent();
  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
          TII->get(AMDGPU::COPY), PhiOut)
      .addReg(NewReg);

  return true;
}

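// If all users of a VGPR load result are copies or reg_sequences feeding
// AGPRs, change the load result (and the intermediate registers) to AGPR
// classes.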
bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
  MachineOperand &Def = MI.getOperand(0);
  if (!Def.isDef())
    return false;

  Register DefReg = Def.getReg();
  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
    return false;

  SmallVector<const MachineInstr *, 8> Users(
      llvm::make_pointer_range(MRI->use_nodbg_instructions(DefReg)));
  SmallVector<Register, 8> MoveRegs;

  if (Users.empty())
    return false;

  // Check that all uses are copies to an AGPR or reg_sequences producing one.
  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())
      return false;
    Register DstReg = I->getOperand(0).getReg();
    if (DstReg.isPhysical())
      return false;
    if (TRI->isAGPR(*MRI, DstReg))
      continue;
    MoveRegs.push_back(DstReg);
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
      Users.push_back(&U);
  }

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);
    return false;
  }

  while (!MoveRegs.empty()) {
    Register Reg = MoveRegs.pop_back_val();
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  }

  return true;
}

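// On targets without fast AGPR-AGPR copies, cache AGPR subregisters that feed
// several PHIs in a VGPR/AGPR pair so they are not re-copied per PHI operand.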
bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
  // Collect the (register, subregister) pairs used by AGPR PHIs in this block.
  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
      RegToMO;

  for (auto &MI : MBB) {
    if (!MI.isPHI())
      break;

    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
      continue;

    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
      MachineOperand &PhiMO = MI.getOperand(K);
      if (!PhiMO.getSubReg())
        continue;
      RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
    }
  }

  bool Changed = false;
  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)
      continue;

    const auto [Reg, SubReg] = Entry;
    MachineInstr *Def = MRI->getVRegDef(Reg);
    MachineBasicBlock *DefMBB = Def->getParent();

    // Cache the value in a VGPR via V_ACCVGPR_READ, then copy it back to an
    // AGPR and use that in all the PHI operands.
    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
    Register TempVGPR =
        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
    MachineInstr *VGPRCopy =
        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
            .addReg(Reg, 0, SubReg);

    Register TempAGPR = MRI->createVirtualRegister(ARC);
    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
            TII->get(AMDGPU::COPY), TempAGPR)
        .addReg(TempVGPR);

    for (MachineOperand *MO : MOs) {
      MO->setReg(TempAGPR);
      MO->setSubReg(AMDGPU::NoSubRegister);
    }

    Changed = true;
  }

  return Changed;
}

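// Pass driver: walk the function in depth-first block order and apply the
// individual folding routines to each instruction.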
bool SIFoldOperandsImpl::run(MachineFunction &MF) {
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by the hardware if the IEEE bit is enabled, and it does
  // not correctly handle signed zeros.
  bool IsIEEEMode = MFI->getMode().IEEE;
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  bool Changed = false;
  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineOperand *CurrentKnownM0Val = nullptr;
    for (auto &MI : make_early_inc_range(*MBB)) {
      Changed |= tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
        Changed = true;
        continue;
      }

      if (MI.mayLoad() && tryFoldLoad(MI)) {
        Changed = true;
        continue;
      }

      if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
        continue;
      }

      // Saw an unknown clobber of m0, so we no longer know what it is.
      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
        CurrentKnownM0Val = nullptr;

      if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
          !tryFoldOMod(MI))
        Changed |= tryFoldClamp(MI);
    }

    Changed |= tryOptimizeAGPRPhis(*MBB);
  }

  return Changed;
}

PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
                                          MachineFunctionAnalysisManager &) {
  bool Changed = SIFoldOperandsImpl().run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}