1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with nearby immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote a constant offset to the immediate by
23// adjusting the base. It tries to use a base from the nearby instructions that
24// allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25// to the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants because loading
46// the constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offset fields but are close enough together, we can add to the base
56// pointer and use the new, reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
61#include "AMDGPU.h"
62#include "GCNSubtarget.h"
64#include "SIDefines.h"
68
69using namespace llvm;
70
71#define DEBUG_TYPE "si-load-store-opt"
72
73namespace {
74enum InstClassEnum {
75 UNKNOWN,
76 DS_READ,
77 DS_WRITE,
78 S_BUFFER_LOAD_IMM,
79 S_BUFFER_LOAD_SGPR_IMM,
80 S_LOAD_IMM,
81 BUFFER_LOAD,
82 BUFFER_STORE,
83 MIMG,
84 TBUFFER_LOAD,
85 TBUFFER_STORE,
86 GLOBAL_LOAD_SADDR,
87 GLOBAL_STORE_SADDR,
88 FLAT_LOAD,
89 FLAT_STORE,
90 FLAT_LOAD_SADDR,
91 FLAT_STORE_SADDR,
92 GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
93 GLOBAL_STORE // any CombineInfo, they are only ever returned by
94 // getCommonInstClass.
95};
96
97struct AddressRegs {
98 unsigned char NumVAddrs = 0;
99 bool SBase = false;
100 bool SRsrc = false;
101 bool SOffset = false;
102 bool SAddr = false;
103 bool VAddr = false;
104 bool Addr = false;
105 bool SSamp = false;
106};
107
108// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
109const unsigned MaxAddressRegs = 12 + 1 + 1;
110
111class SILoadStoreOptimizer {
112 struct CombineInfo {
113 MachineBasicBlock::iterator I;
114 unsigned EltSize;
115 unsigned Offset;
116 unsigned Width;
117 unsigned Format;
118 unsigned BaseOff;
119 unsigned DMask;
120 InstClassEnum InstClass;
121 unsigned CPol = 0;
122 const TargetRegisterClass *DataRC;
123 bool UseST64;
124 int AddrIdx[MaxAddressRegs];
125 const MachineOperand *AddrReg[MaxAddressRegs];
126 unsigned NumAddresses;
127 unsigned Order;
128
129 bool hasSameBaseAddress(const CombineInfo &CI) {
130 if (NumAddresses != CI.NumAddresses)
131 return false;
132
133 const MachineInstr &MI = *CI.I;
134 for (unsigned i = 0; i < NumAddresses; i++) {
135 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
136
137 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
138 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
139 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
140 return false;
141 }
142 continue;
143 }
144
145 // Check same base pointer. Be careful of subregisters, which can occur
146 // with vectors of pointers.
147 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
148 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
149 return false;
150 }
151 }
152 return true;
153 }
154
155 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
156 for (unsigned i = 0; i < NumAddresses; ++i) {
157 const MachineOperand *AddrOp = AddrReg[i];
158 // Immediates are always OK.
159 if (AddrOp->isImm())
160 continue;
161
162 // Don't try to merge addresses that aren't either immediates or registers.
163 // TODO: Should be possible to merge FrameIndexes and maybe some other
164// non-register operands.
165 if (!AddrOp->isReg())
166 return false;
167
168 // TODO: We should be able to merge instructions with other physical reg
169 // addresses too.
170 if (AddrOp->getReg().isPhysical() &&
171 AddrOp->getReg() != AMDGPU::SGPR_NULL)
172 return false;
173
174 // If an address has only one use then there will be no other
175 // instructions with the same address, so we can't merge this one.
176 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
177 return false;
178 }
179 return true;
180 }
181
182 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
183
184 // Compare by pointer order.
185 bool operator<(const CombineInfo& Other) const {
186 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
187 }
188 };
189
190 struct BaseRegisters {
191 Register LoReg;
192 Register HiReg;
193
194 unsigned LoSubReg = 0;
195 unsigned HiSubReg = 0;
196 };
197
198 struct MemAddress {
199 BaseRegisters Base;
200 int64_t Offset = 0;
201 };
202
203 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
204
205private:
206 MachineFunction *MF = nullptr;
207 const GCNSubtarget *STM = nullptr;
208 const SIInstrInfo *TII = nullptr;
209 const SIRegisterInfo *TRI = nullptr;
210 MachineRegisterInfo *MRI = nullptr;
211 AliasAnalysis *AA = nullptr;
212 bool OptimizeAgain;
213
214 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
215 const DenseSet<Register> &ARegUses,
216 const MachineInstr &A, const MachineInstr &B) const;
217 static bool dmasksCanBeCombined(const CombineInfo &CI,
218 const SIInstrInfo &TII,
219 const CombineInfo &Paired);
220 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
221 CombineInfo &Paired, bool Modify = false);
222 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
223 const CombineInfo &Paired);
224 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
225 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
226 const CombineInfo &Paired);
227 const TargetRegisterClass *
228 getTargetRegisterClass(const CombineInfo &CI,
229 const CombineInfo &Paired) const;
230 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
231
232 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
233
234 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
235 MachineBasicBlock::iterator InsertBefore,
236 AMDGPU::OpName OpName, Register DestReg) const;
237 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
238 MachineBasicBlock::iterator InsertBefore,
239 AMDGPU::OpName OpName) const;
240
241 unsigned read2Opcode(unsigned EltSize) const;
242 unsigned read2ST64Opcode(unsigned EltSize) const;
243 MachineBasicBlock::iterator
244 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
245 MachineBasicBlock::iterator InsertBefore);
246
247 unsigned write2Opcode(unsigned EltSize) const;
248 unsigned write2ST64Opcode(unsigned EltSize) const;
249 unsigned getWrite2Opcode(const CombineInfo &CI) const;
250
251 MachineBasicBlock::iterator
252 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
253 MachineBasicBlock::iterator InsertBefore);
254 MachineBasicBlock::iterator
255 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
256 MachineBasicBlock::iterator InsertBefore);
257 MachineBasicBlock::iterator
258 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
259 MachineBasicBlock::iterator InsertBefore);
260 MachineBasicBlock::iterator
261 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
262 MachineBasicBlock::iterator InsertBefore);
263 MachineBasicBlock::iterator
264 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
265 MachineBasicBlock::iterator InsertBefore);
266 MachineBasicBlock::iterator
267 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
268 MachineBasicBlock::iterator InsertBefore);
269 MachineBasicBlock::iterator
270 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
271 MachineBasicBlock::iterator InsertBefore);
272 MachineBasicBlock::iterator
273 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
274 MachineBasicBlock::iterator InsertBefore);
275 MachineBasicBlock::iterator
276 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
277 MachineBasicBlock::iterator InsertBefore);
278
279 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
280 int32_t NewOffset) const;
281 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
282 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
283 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
284 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
285 /// Promotes a constant offset to the immediate by adjusting the base. It
286 /// tries to use a base from the nearby instructions that allows it to have
287 /// a 13-bit constant offset which gets promoted to the immediate.
288 bool promoteConstantOffsetToImm(MachineInstr &CI,
289 MemInfoMap &Visited,
290 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
291 void addInstToMergeableList(const CombineInfo &CI,
292 std::list<std::list<CombineInfo> > &MergeableInsts) const;
293
294 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
295 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
296 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
297 std::list<std::list<CombineInfo>> &MergeableInsts) const;
298
299 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
300 const CombineInfo &Paired);
301
302 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
303 const CombineInfo &Paired);
304
305 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
306 bool &OptimizeListAgain);
307 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
308
309public:
310 SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
311 bool run(MachineFunction &MF);
312};
313
314class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
315public:
316 static char ID;
317
318 SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}
319
320 bool runOnMachineFunction(MachineFunction &MF) override;
321
322 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
323
324 void getAnalysisUsage(AnalysisUsage &AU) const override {
325 AU.setPreservesCFG();
326 AU.addRequired<AAResultsWrapperPass>();
327
328 MachineFunctionPass::getAnalysisUsage(AU);
329 }
330
331 MachineFunctionProperties getRequiredProperties() const override {
332 return MachineFunctionProperties().setIsSSA();
333 }
334};
335
336static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
337 const unsigned Opc = MI.getOpcode();
338
339 if (TII.isMUBUF(Opc)) {
340 // FIXME: Handle d16 correctly
341 return AMDGPU::getMUBUFElements(Opc);
342 }
343 if (TII.isImage(MI)) {
344 uint64_t DMaskImm =
345 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
346 return llvm::popcount(DMaskImm);
347 }
348 if (TII.isMTBUF(Opc)) {
349 return AMDGPU::getMTBUFElements(Opc);
350 }
351
352 switch (Opc) {
353 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
354 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
355 case AMDGPU::S_LOAD_DWORD_IMM:
356 case AMDGPU::GLOBAL_LOAD_DWORD:
357 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
358 case AMDGPU::GLOBAL_STORE_DWORD:
359 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
360 case AMDGPU::FLAT_LOAD_DWORD:
361 case AMDGPU::FLAT_STORE_DWORD:
362 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
363 case AMDGPU::FLAT_STORE_DWORD_SADDR:
364 return 1;
365 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
366 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
367 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
368 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
369 case AMDGPU::S_LOAD_DWORDX2_IMM:
370 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
371 case AMDGPU::GLOBAL_LOAD_DWORDX2:
372 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
373 case AMDGPU::GLOBAL_STORE_DWORDX2:
374 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
375 case AMDGPU::FLAT_LOAD_DWORDX2:
376 case AMDGPU::FLAT_STORE_DWORDX2:
377 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
378 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
379 return 2;
380 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
381 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
382 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
383 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
384 case AMDGPU::S_LOAD_DWORDX3_IMM:
385 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
386 case AMDGPU::GLOBAL_LOAD_DWORDX3:
387 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
388 case AMDGPU::GLOBAL_STORE_DWORDX3:
389 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
390 case AMDGPU::FLAT_LOAD_DWORDX3:
391 case AMDGPU::FLAT_STORE_DWORDX3:
392 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
393 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
394 return 3;
395 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
396 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
397 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
398 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
399 case AMDGPU::S_LOAD_DWORDX4_IMM:
400 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
401 case AMDGPU::GLOBAL_LOAD_DWORDX4:
402 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
403 case AMDGPU::GLOBAL_STORE_DWORDX4:
404 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
405 case AMDGPU::FLAT_LOAD_DWORDX4:
406 case AMDGPU::FLAT_STORE_DWORDX4:
407 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
408 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
409 return 4;
410 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
411 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
412 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
413 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
414 case AMDGPU::S_LOAD_DWORDX8_IMM:
415 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
416 return 8;
417 case AMDGPU::DS_READ_B32:
418 case AMDGPU::DS_READ_B32_gfx9:
419 case AMDGPU::DS_WRITE_B32:
420 case AMDGPU::DS_WRITE_B32_gfx9:
421 return 1;
422 case AMDGPU::DS_READ_B64:
423 case AMDGPU::DS_READ_B64_gfx9:
424 case AMDGPU::DS_WRITE_B64:
425 case AMDGPU::DS_WRITE_B64_gfx9:
426 return 2;
427 default:
428 return 0;
429 }
430}
431
432/// Maps instruction opcode to enum InstClassEnum.
433static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
434 switch (Opc) {
435 default:
436 if (TII.isMUBUF(Opc)) {
437 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
438 default:
439 return UNKNOWN;
440 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
441 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
442 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
443 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
444 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
445 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
446 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
447 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
448 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
449 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
450 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
451 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
452 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
453 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
454 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
455 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
456 return BUFFER_LOAD;
457 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
458 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
459 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
460 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
461 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
462 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
463 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
464 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
465 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
466 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
467 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
468 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
469 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
470 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
471 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
472 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
473 return BUFFER_STORE;
474 }
475 }
476 if (TII.isImage(Opc)) {
477 // Ignore instructions encoded without vaddr.
478 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
479 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
480 return UNKNOWN;
481 // Ignore BVH instructions
482 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
483 return UNKNOWN;
484 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
485 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
486 TII.isGather4(Opc))
487 return UNKNOWN;
488 return MIMG;
489 }
490 if (TII.isMTBUF(Opc)) {
491 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
492 default:
493 return UNKNOWN;
494 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
495 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
496 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
497 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
498 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
499 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
500 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
501 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
502 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
503 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
504 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
505 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
506 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
507 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
508 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
509 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
510 return TBUFFER_LOAD;
511 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
512 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
513 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
514 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
515 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
516 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
517 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
518 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
519 return TBUFFER_STORE;
520 }
521 }
522 return UNKNOWN;
523 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
524 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
525 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
526 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
527 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
528 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
529 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
530 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
531 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
532 return S_BUFFER_LOAD_IMM;
533 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
534 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
535 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
536 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
537 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
538 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
539 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
540 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
541 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
542 return S_BUFFER_LOAD_SGPR_IMM;
543 case AMDGPU::S_LOAD_DWORD_IMM:
544 case AMDGPU::S_LOAD_DWORDX2_IMM:
545 case AMDGPU::S_LOAD_DWORDX3_IMM:
546 case AMDGPU::S_LOAD_DWORDX4_IMM:
547 case AMDGPU::S_LOAD_DWORDX8_IMM:
548 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
549 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
550 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
551 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
552 return S_LOAD_IMM;
553 case AMDGPU::DS_READ_B32:
554 case AMDGPU::DS_READ_B32_gfx9:
555 case AMDGPU::DS_READ_B64:
556 case AMDGPU::DS_READ_B64_gfx9:
557 return DS_READ;
558 case AMDGPU::DS_WRITE_B32:
559 case AMDGPU::DS_WRITE_B32_gfx9:
560 case AMDGPU::DS_WRITE_B64:
561 case AMDGPU::DS_WRITE_B64_gfx9:
562 return DS_WRITE;
563 case AMDGPU::GLOBAL_LOAD_DWORD:
564 case AMDGPU::GLOBAL_LOAD_DWORDX2:
565 case AMDGPU::GLOBAL_LOAD_DWORDX3:
566 case AMDGPU::GLOBAL_LOAD_DWORDX4:
567 case AMDGPU::FLAT_LOAD_DWORD:
568 case AMDGPU::FLAT_LOAD_DWORDX2:
569 case AMDGPU::FLAT_LOAD_DWORDX3:
570 case AMDGPU::FLAT_LOAD_DWORDX4:
571 return FLAT_LOAD;
572 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
573 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
574 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
575 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
576 return GLOBAL_LOAD_SADDR;
577 case AMDGPU::GLOBAL_STORE_DWORD:
578 case AMDGPU::GLOBAL_STORE_DWORDX2:
579 case AMDGPU::GLOBAL_STORE_DWORDX3:
580 case AMDGPU::GLOBAL_STORE_DWORDX4:
581 case AMDGPU::FLAT_STORE_DWORD:
582 case AMDGPU::FLAT_STORE_DWORDX2:
583 case AMDGPU::FLAT_STORE_DWORDX3:
584 case AMDGPU::FLAT_STORE_DWORDX4:
585 return FLAT_STORE;
586 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
587 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
588 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
589 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
590 return GLOBAL_STORE_SADDR;
591 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
592 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
593 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
594 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
595 return FLAT_LOAD_SADDR;
596 case AMDGPU::FLAT_STORE_DWORD_SADDR:
597 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
598 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
599 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
600 return FLAT_STORE_SADDR;
601 }
602}
603
604/// Determines instruction subclass from opcode. Only instructions
605/// of the same subclass can be merged together. The merged instruction may have
606/// a different subclass but must have the same class.
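/// For example, every S_LOAD_DWORD*_IMM variant maps to the S_LOAD_DWORD_IMM
/// subclass here, so any two of them can be considered for pairing, and the
/// wider S_LOAD opcode produced by the merge still has the S_LOAD_IMM class.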
607static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
608 switch (Opc) {
609 default:
610 if (TII.isMUBUF(Opc))
611 return AMDGPU::getMUBUFBaseOpcode(Opc);
612 if (TII.isImage(Opc)) {
613 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
614 assert(Info);
615 return Info->BaseOpcode;
616 }
617 if (TII.isMTBUF(Opc))
618 return AMDGPU::getMTBUFBaseOpcode(Opc);
619 return -1;
620 case AMDGPU::DS_READ_B32:
621 case AMDGPU::DS_READ_B32_gfx9:
622 case AMDGPU::DS_READ_B64:
623 case AMDGPU::DS_READ_B64_gfx9:
624 case AMDGPU::DS_WRITE_B32:
625 case AMDGPU::DS_WRITE_B32_gfx9:
626 case AMDGPU::DS_WRITE_B64:
627 case AMDGPU::DS_WRITE_B64_gfx9:
628 return Opc;
629 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
630 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
631 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
632 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
633 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
634 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
635 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
636 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
637 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
638 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
639 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
640 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
641 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
642 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
643 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
644 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
645 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
646 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
647 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
648 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
649 case AMDGPU::S_LOAD_DWORD_IMM:
650 case AMDGPU::S_LOAD_DWORDX2_IMM:
651 case AMDGPU::S_LOAD_DWORDX3_IMM:
652 case AMDGPU::S_LOAD_DWORDX4_IMM:
653 case AMDGPU::S_LOAD_DWORDX8_IMM:
654 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
655 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
656 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
657 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
658 return AMDGPU::S_LOAD_DWORD_IMM;
659 case AMDGPU::GLOBAL_LOAD_DWORD:
660 case AMDGPU::GLOBAL_LOAD_DWORDX2:
661 case AMDGPU::GLOBAL_LOAD_DWORDX3:
662 case AMDGPU::GLOBAL_LOAD_DWORDX4:
663 case AMDGPU::FLAT_LOAD_DWORD:
664 case AMDGPU::FLAT_LOAD_DWORDX2:
665 case AMDGPU::FLAT_LOAD_DWORDX3:
666 case AMDGPU::FLAT_LOAD_DWORDX4:
667 return AMDGPU::FLAT_LOAD_DWORD;
668 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
669 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
670 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
671 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
672 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
673 case AMDGPU::GLOBAL_STORE_DWORD:
674 case AMDGPU::GLOBAL_STORE_DWORDX2:
675 case AMDGPU::GLOBAL_STORE_DWORDX3:
676 case AMDGPU::GLOBAL_STORE_DWORDX4:
677 case AMDGPU::FLAT_STORE_DWORD:
678 case AMDGPU::FLAT_STORE_DWORDX2:
679 case AMDGPU::FLAT_STORE_DWORDX3:
680 case AMDGPU::FLAT_STORE_DWORDX4:
681 return AMDGPU::FLAT_STORE_DWORD;
682 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
683 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
684 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
685 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
686 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
687 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
688 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
689 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
690 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
691 return AMDGPU::FLAT_LOAD_DWORD_SADDR;
692 case AMDGPU::FLAT_STORE_DWORD_SADDR:
693 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
694 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
695 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
696 return AMDGPU::FLAT_STORE_DWORD_SADDR;
697 }
698}
699
700// GLOBAL loads and stores are classified as FLAT initially. If both combined
701// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
702// If either or both instructions are non-segment-specific FLAT, the resulting
703// combined operation will be FLAT, potentially promoting one of the GLOBAL
704// operations to FLAT.
705// For other instructions return the original unmodified class.
706InstClassEnum
707SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
708 const CombineInfo &Paired) {
709 assert(CI.InstClass == Paired.InstClass);
710
711 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
712 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
713 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
714
715 return CI.InstClass;
716}
717
718static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
719 AddressRegs Result;
720
721 if (TII.isMUBUF(Opc)) {
722 if (AMDGPU::getMUBUFHasVAddr(Opc))
723 Result.VAddr = true;
724 if (AMDGPU::getMUBUFHasSrsrc(Opc))
725 Result.SRsrc = true;
726 if (AMDGPU::getMUBUFHasSoffset(Opc))
727 Result.SOffset = true;
728
729 return Result;
730 }
731
732 if (TII.isImage(Opc)) {
733 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
734 if (VAddr0Idx >= 0) {
735 AMDGPU::OpName RsrcName =
736 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
737 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
738 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
739 } else {
740 Result.VAddr = true;
741 }
742 Result.SRsrc = true;
743 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
744 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
745 Result.SSamp = true;
746
747 return Result;
748 }
749 if (TII.isMTBUF(Opc)) {
750 if (AMDGPU::getMTBUFHasVAddr(Opc))
751 Result.VAddr = true;
752 if (AMDGPU::getMTBUFHasSrsrc(Opc))
753 Result.SRsrc = true;
754 if (AMDGPU::getMTBUFHasSoffset(Opc))
755 Result.SOffset = true;
756
757 return Result;
758 }
759
760 switch (Opc) {
761 default:
762 return Result;
763 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
764 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
765 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
766 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
767 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
768 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
769 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
770 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
771 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
772 Result.SOffset = true;
773 [[fallthrough]];
774 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
775 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
776 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
777 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
778 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
779 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
780 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
781 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
782 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
783 case AMDGPU::S_LOAD_DWORD_IMM:
784 case AMDGPU::S_LOAD_DWORDX2_IMM:
785 case AMDGPU::S_LOAD_DWORDX3_IMM:
786 case AMDGPU::S_LOAD_DWORDX4_IMM:
787 case AMDGPU::S_LOAD_DWORDX8_IMM:
788 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
789 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
790 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
791 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
792 Result.SBase = true;
793 return Result;
794 case AMDGPU::DS_READ_B32:
795 case AMDGPU::DS_READ_B64:
796 case AMDGPU::DS_READ_B32_gfx9:
797 case AMDGPU::DS_READ_B64_gfx9:
798 case AMDGPU::DS_WRITE_B32:
799 case AMDGPU::DS_WRITE_B64:
800 case AMDGPU::DS_WRITE_B32_gfx9:
801 case AMDGPU::DS_WRITE_B64_gfx9:
802 Result.Addr = true;
803 return Result;
804 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
805 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
806 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
807 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
808 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
809 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
810 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
811 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
812 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
813 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
814 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
815 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
816 case AMDGPU::FLAT_STORE_DWORD_SADDR:
817 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
818 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
819 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
820 Result.SAddr = true;
821 [[fallthrough]];
822 case AMDGPU::GLOBAL_LOAD_DWORD:
823 case AMDGPU::GLOBAL_LOAD_DWORDX2:
824 case AMDGPU::GLOBAL_LOAD_DWORDX3:
825 case AMDGPU::GLOBAL_LOAD_DWORDX4:
826 case AMDGPU::GLOBAL_STORE_DWORD:
827 case AMDGPU::GLOBAL_STORE_DWORDX2:
828 case AMDGPU::GLOBAL_STORE_DWORDX3:
829 case AMDGPU::GLOBAL_STORE_DWORDX4:
830 case AMDGPU::FLAT_LOAD_DWORD:
831 case AMDGPU::FLAT_LOAD_DWORDX2:
832 case AMDGPU::FLAT_LOAD_DWORDX3:
833 case AMDGPU::FLAT_LOAD_DWORDX4:
834 case AMDGPU::FLAT_STORE_DWORD:
835 case AMDGPU::FLAT_STORE_DWORDX2:
836 case AMDGPU::FLAT_STORE_DWORDX3:
837 case AMDGPU::FLAT_STORE_DWORDX4:
838 Result.VAddr = true;
839 return Result;
840 }
841}
842
843void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
844 const SILoadStoreOptimizer &LSO) {
845 I = MI;
846 unsigned Opc = MI->getOpcode();
847 InstClass = getInstClass(Opc, *LSO.TII);
848
849 if (InstClass == UNKNOWN)
850 return;
851
852 DataRC = LSO.getDataRegClass(*MI);
853
854 switch (InstClass) {
855 case DS_READ:
856 EltSize =
857 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
858 : 4;
859 break;
860 case DS_WRITE:
861 EltSize =
862 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
863 : 4;
864 break;
865 case S_BUFFER_LOAD_IMM:
866 case S_BUFFER_LOAD_SGPR_IMM:
867 case S_LOAD_IMM:
868 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
869 break;
870 default:
871 EltSize = 4;
872 break;
873 }
874
875 if (InstClass == MIMG) {
876 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
877 // Offset is not considered for MIMG instructions.
878 Offset = 0;
879 } else {
880 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
881 Offset = I->getOperand(OffsetIdx).getImm();
882 }
883
884 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
885 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
886 const AMDGPU::GcnBufferFormatInfo *Info =
887 AMDGPU::getGcnBufferFormatInfo(Format, *LSO.STM);
888 EltSize = Info->BitsPerComp / 8;
889 }
890
891 Width = getOpcodeWidth(*I, *LSO.TII);
892
893 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
894 Offset &= 0xffff;
895 } else if (InstClass != MIMG) {
896 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
897 }
898
899 AddressRegs Regs = getRegs(Opc, *LSO.TII);
900 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
901
902 NumAddresses = 0;
903 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
904 AddrIdx[NumAddresses++] =
905 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
906 if (Regs.Addr)
907 AddrIdx[NumAddresses++] =
908 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
909 if (Regs.SBase)
910 AddrIdx[NumAddresses++] =
911 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
912 if (Regs.SRsrc)
913 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
914 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
915 if (Regs.SOffset)
916 AddrIdx[NumAddresses++] =
917 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
918 if (Regs.SAddr)
919 AddrIdx[NumAddresses++] =
920 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
921 if (Regs.VAddr)
922 AddrIdx[NumAddresses++] =
923 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
924 if (Regs.SSamp)
925 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
926 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
927 assert(NumAddresses <= MaxAddressRegs);
928
929 for (unsigned J = 0; J < NumAddresses; J++)
930 AddrReg[J] = &I->getOperand(AddrIdx[J]);
931}
932
933} // end anonymous namespace.
934
935INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
936 "SI Load Store Optimizer", false, false)
937INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
938INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
939 "SI Load Store Optimizer", false, false)
940
941char SILoadStoreOptimizerLegacy::ID = 0;
942
943char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;
944
945FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
946 return new SILoadStoreOptimizerLegacy();
947}
948
949static void addDefsUsesToList(const MachineInstr &MI,
950 DenseSet<Register> &RegDefs,
951 DenseSet<Register> &RegUses) {
952 for (const auto &Op : MI.operands()) {
953 if (!Op.isReg())
954 continue;
955 if (Op.isDef())
956 RegDefs.insert(Op.getReg());
957 if (Op.readsReg())
958 RegUses.insert(Op.getReg());
959 }
960}
961
962bool SILoadStoreOptimizer::canSwapInstructions(
963 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
964 const MachineInstr &A, const MachineInstr &B) const {
965 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
966 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
967 return false;
968 for (const auto &BOp : B.operands()) {
969 if (!BOp.isReg())
970 continue;
971 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
972 return false;
973 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
974 return false;
975 }
976 return true;
977}
978
979// Given that \p CI and \p Paired are adjacent memory operations, produce a new
980// MMO for the combined operation with a new access size.
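// For example, merging two adjacent 4-byte accesses yields a single MMO of
// size 8 that reuses the pointer info of the leading (lower-offset) operation.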
981MachineMemOperand *
982SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
983 const CombineInfo &Paired) {
984 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
985 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
986
987 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
988
989 // A base pointer for the combined operation is the same as the leading
990 // operation's pointer.
991 if (Paired < CI)
992 std::swap(MMOa, MMOb);
993
994 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
995 // If merging FLAT and GLOBAL, set the address space to FLAT.
996 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
997 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
998
999 MachineFunction *MF = CI.I->getMF();
1000 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
1001}
1002
1003bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
1004 const SIInstrInfo &TII,
1005 const CombineInfo &Paired) {
1006 assert(CI.InstClass == MIMG);
1007
1008 // Ignore instructions with tfe/lwe set.
1009 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
1010 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
1011
1012 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
1013 return false;
1014
1015 // Check other optional immediate operands for equality.
1016 AMDGPU::OpName OperandsToMatch[] = {
1017 AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
1018 AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};
1019
1020 for (AMDGPU::OpName op : OperandsToMatch) {
1021 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
1022 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
1023 return false;
1024 if (Idx != -1 &&
1025 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
1026 return false;
1027 }
1028
1029 // Check DMask for overlaps.
1030 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
1031 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
1032
1033 if (!MaxMask)
1034 return false;
1035
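 // Illustrative example: dmasks 0x3 (xy) and 0xc (zw) can be combined, since
 // 0x3 lies entirely below the lowest set bit of 0xc; dmasks 0x3 and 0x6 are
 // rejected because they overlap in bit 1.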
1036 unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
1037 if ((1u << AllowedBitsForMin) <= MinMask)
1038 return false;
1039
1040 return true;
1041}
1042
1043static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
1044 unsigned ComponentCount,
1045 const GCNSubtarget &STI) {
1046 if (ComponentCount > 4)
1047 return 0;
1048
1049 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
1050 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
1051 if (!OldFormatInfo)
1052 return 0;
1053
1054 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
1055 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
1056 ComponentCount,
1057 OldFormatInfo->NumFormat, STI);
1058
1059 if (!NewFormatInfo)
1060 return 0;
1061
1062 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
1063 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
1064
1065 return NewFormatInfo->Format;
1066}
1067
1068// Return the value in the inclusive range [Lo,Hi] that is aligned to the
1069// highest power of two. Note that the result is well defined for all inputs
1070// including corner cases like:
1071// - if Lo == Hi, return that value
1072// - if Lo == 0, return 0 (even though the "- 1" below underflows)
1073// - if Lo > Hi, return 0 (as if the range wrapped around)
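// Illustrative example: for Lo = 5 and Hi = 10, (Lo - 1) ^ Hi = 0b1110, so the
// mask below clears the low three bits of Hi and the result is 8, the most
// aligned value in [5, 10].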
1074static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
1075 return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
1076}
1077
1078bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1079 const GCNSubtarget &STI,
1080 CombineInfo &Paired,
1081 bool Modify) {
1082 assert(CI.InstClass != MIMG);
1083
1084 // XXX - Would the same offset be OK? Is there any reason this would happen or
1085 // be useful?
1086 if (CI.Offset == Paired.Offset)
1087 return false;
1088
1089 // This won't be valid if the offset isn't aligned.
1090 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1091 return false;
1092
1093 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1094
1095 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1096 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1097 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1098 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1099
1100 if (Info0->BitsPerComp != Info1->BitsPerComp ||
1101 Info0->NumFormat != Info1->NumFormat)
1102 return false;
1103
1104 // For 8-bit or 16-bit formats there is no 3-component variant.
1105 // If NumCombinedComponents is 3, try the 4-component format and use XYZ.
1106 // Example:
1107 // tbuffer_load_format_x + tbuffer_load_format_x + tbuffer_load_format_x
1108 // ==> tbuffer_load_format_xyz with format:[BUF_FMT_16_16_16_16_SNORM]
1109 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1110 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1111 NumCombinedComponents = 4;
1112
1113 if (getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, STI) ==
1114 0)
1115 return false;
1116
1117 // Merge only when the two access ranges are strictly back-to-back;
1118 // any gap or overlap can overwrite data or leave holes.
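 // For example, with a 32-bit X format (EltSize 4), offsets 16 and 20 are
 // element indices 4 and 5 and may merge, while offsets 16 and 24 leave a
 // one-element hole and are rejected.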
1119 unsigned ElemIndex0 = CI.Offset / CI.EltSize;
1120 unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
1121 if (ElemIndex0 + CI.Width != ElemIndex1 &&
1122 ElemIndex1 + Paired.Width != ElemIndex0)
1123 return false;
1124
1125 // 1-byte formats require 1-byte alignment.
1126 // 2-byte formats require 2-byte alignment.
1127 // 4-byte and larger formats require 4-byte alignment.
1128 unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
1129 unsigned RequiredAlign = std::min(MergedBytes, 4u);
1130 unsigned MinOff = std::min(CI.Offset, Paired.Offset);
1131 if (MinOff % RequiredAlign != 0)
1132 return false;
1133
1134 return true;
1135 }
1136
1137 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1138 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1139 CI.UseST64 = false;
1140 CI.BaseOff = 0;
1141
1142 // Handle all non-DS instructions.
1143 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1144 if (EltOffset0 + CI.Width != EltOffset1 &&
1145 EltOffset1 + Paired.Width != EltOffset0)
1146 return false;
1147 // Instructions with the scale_offset modifier cannot be combined unless we
1148 // also generate code to scale the offset and reset that bit.
1149 if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL))
1150 return false;
1151 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1152 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1153 // Reject cases like:
1154 // dword + dwordx2 -> dwordx3
1155 // dword + dwordx3 -> dwordx4
1156 // If we tried to combine these cases, we would fail to extract a subreg
1157 // for the result of the second load due to SGPR alignment requirements.
1158 if (CI.Width != Paired.Width &&
1159 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1160 return false;
1161 }
1162 return true;
1163 }
1164
1165 // If the offset in elements doesn't fit in 8 bits, we might be able to use
1166 // the stride 64 versions.
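 // For example, two ds_read_b32 at byte offsets 0x4000 and 0x4100 have element
 // offsets 0x1000 and 0x1040; both are multiples of 64, and 0x40 and 0x41 fit
 // in 8 bits, so a ds_read2st64_b32 can encode them.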
1167 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1168 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1169 if (Modify) {
1170 CI.Offset = EltOffset0 / 64;
1171 Paired.Offset = EltOffset1 / 64;
1172 CI.UseST64 = true;
1173 }
1174 return true;
1175 }
1176
1177 // Check if the new offsets fit in the reduced 8-bit range.
1178 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1179 if (Modify) {
1180 CI.Offset = EltOffset0;
1181 Paired.Offset = EltOffset1;
1182 }
1183 return true;
1184 }
1185
1186 // Try to shift base address to decrease offsets.
1187 uint32_t Min = std::min(EltOffset0, EltOffset1);
1188 uint32_t Max = std::max(EltOffset0, EltOffset1);
1189
1190 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1191 if (((Max - Min) & ~Mask) == 0) {
1192 if (Modify) {
1193 // From the range of values we could use for BaseOff, choose the one that
1194 // is aligned to the highest power of two, to maximise the chance that
1195 // the same offset can be reused for other load/store pairs.
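 // Illustrative example (EltSize 4): byte offsets 0x11000 and 0x11100 give
 // element offsets 0x4400 and 0x4440; BaseOff becomes 0x4000 elements
 // (0x10000 bytes), and the merged ds_read2st64 uses offsets 16 and 17.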
1196 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1197 // Copy the low bits of the offsets, so that when we adjust them by
1198 // subtracting BaseOff they will be multiples of 64.
1199 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1200 CI.BaseOff = BaseOff * CI.EltSize;
1201 CI.Offset = (EltOffset0 - BaseOff) / 64;
1202 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1203 CI.UseST64 = true;
1204 }
1205 return true;
1206 }
1207
1208 if (isUInt<8>(Max - Min)) {
1209 if (Modify) {
1210 // From the range of values we could use for BaseOff, choose the one that
1211 // is aligned to the highest power of two, to maximise the chance that
1212 // the same offset can be reused for other load/store pairs.
1213 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1214 CI.BaseOff = BaseOff * CI.EltSize;
1215 CI.Offset = EltOffset0 - BaseOff;
1216 Paired.Offset = EltOffset1 - BaseOff;
1217 }
1218 return true;
1219 }
1220
1221 return false;
1222}
1223
1224bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1225 const CombineInfo &CI,
1226 const CombineInfo &Paired) {
1227 const unsigned Width = (CI.Width + Paired.Width);
1228 switch (CI.InstClass) {
1229 default:
1230 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1231 case S_BUFFER_LOAD_IMM:
1232 case S_BUFFER_LOAD_SGPR_IMM:
1233 case S_LOAD_IMM:
1234 switch (Width) {
1235 default:
1236 return false;
1237 case 2:
1238 case 4:
1239 case 8:
1240 return true;
1241 case 3:
1242 return STM.hasScalarDwordx3Loads();
1243 }
1244 }
1245}
1246
1247const TargetRegisterClass *
1248SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1249 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1250 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1251 }
1252 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1253 return TRI->getRegClassForReg(*MRI, Src->getReg());
1254 }
1255 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1256 return TRI->getRegClassForReg(*MRI, Src->getReg());
1257 }
1258 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1259 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1260 }
1261 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1262 return TRI->getRegClassForReg(*MRI, Src->getReg());
1263 }
1264 return nullptr;
1265}
1266
1267/// This function assumes that CI comes before Paired in a basic block. Return
1268/// an insertion point for the merged instruction or nullptr on failure.
1269SILoadStoreOptimizer::CombineInfo *
1270SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1271 CombineInfo &Paired) {
1272 // If another instruction has already been merged into CI, it may now be a
1273 // type that we can't do any further merging into.
1274 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1275 return nullptr;
1276 assert(CI.InstClass == Paired.InstClass);
1277
1278 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1279 getInstSubclass(Paired.I->getOpcode(), *TII))
1280 return nullptr;
1281
1282 // Check both offsets (or masks for MIMG) can be combined and fit in the
1283 // reduced range.
1284 if (CI.InstClass == MIMG) {
1285 if (!dmasksCanBeCombined(CI, *TII, Paired))
1286 return nullptr;
1287 } else {
1288 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1289 return nullptr;
1290 }
1291
1292 DenseSet<Register> RegDefs;
1293 DenseSet<Register> RegUses;
1294 CombineInfo *Where;
1295 if (CI.I->mayLoad()) {
1296 // Try to hoist Paired up to CI.
1297 addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1298 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1299 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1300 return nullptr;
1301 }
1302 Where = &CI;
1303 } else {
1304 // Try to sink CI down to Paired.
1305 addDefsUsesToList(*CI.I, RegDefs, RegUses);
1306 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1307 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1308 return nullptr;
1309 }
1310 Where = &Paired;
1311 }
1312
1313 // Call offsetsCanBeCombined with modify = true so that the offsets are
1314 // correct for the new instruction. This should return true, because
1315 // this function should only be called on CombineInfo objects that
1316 // have already been confirmed to be mergeable.
1317 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1318 offsetsCanBeCombined(CI, *STM, Paired, true);
1319
1320 if (CI.InstClass == DS_WRITE) {
1321 // Both data operands must be AGPR or VGPR, so the data registers need to
1322 // be constrained to one or the other. We expect to only emit the VGPR form
1323 // here for now.
1324 //
1325 // FIXME: There is currently a hack in getRegClass to report that the write2
1326 // operands are VGPRs. In the future we should have separate agpr
1327 // instruction definitions.
1328 const MachineOperand *Data0 =
1329 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1330 const MachineOperand *Data1 =
1331 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1332
1333 const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI));
1334 int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
1335 AMDGPU::OpName::data0);
1336 int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
1337 AMDGPU::OpName::data1);
1338
1339 const TargetRegisterClass *DataRC0 =
1340 TII->getRegClass(Write2Opc, Data0Idx, TRI, *MF);
1341
1342 const TargetRegisterClass *DataRC1 =
1343 TII->getRegClass(Write2Opc, Data1Idx, TRI, *MF);
1344
1345 if (unsigned SubReg = Data0->getSubReg()) {
1346 DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
1347 DataRC0, SubReg);
1348 }
1349
1350 if (unsigned SubReg = Data1->getSubReg()) {
1351 DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),
1352 DataRC1, SubReg);
1353 }
1354
1355 if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
1356 !MRI->constrainRegClass(Data1->getReg(), DataRC1))
1357 return nullptr;
1358
1359 // TODO: If one register can be constrained, and not the other, insert a
1360 // copy.
1361 }
1362
1363 return Where;
1364}
1365
1366// Copy the merged load result from DestReg to the original dest regs of CI and
1367// Paired.
1368void SILoadStoreOptimizer::copyToDestRegs(
1369 CombineInfo &CI, CombineInfo &Paired,
1370 MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName,
1371 Register DestReg) const {
1372 MachineBasicBlock *MBB = CI.I->getParent();
1373 DebugLoc DL = CI.I->getDebugLoc();
1374
1375 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1376
1377 // Copy to the old destination registers.
1378 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1379 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1380 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1381
1382 // The constrained sload instructions in the S_LOAD_IMM class will have the
1383 // `early-clobber` flag on the dst operand. Remove the flag before using the
1384 // MOs in copies.
1385 Dest0->setIsEarlyClobber(false);
1386 Dest1->setIsEarlyClobber(false);
1387
1388 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1389 .add(*Dest0) // Copy to same destination including flags and sub reg.
1390 .addReg(DestReg, 0, SubRegIdx0);
1391 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1392 .add(*Dest1)
1393 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1394}
1395
1396// Return a register for the source of the merged store after copying the
1397// original source regs of CI and Paired into it.
1398Register
1399SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1400 MachineBasicBlock::iterator InsertBefore,
1401 AMDGPU::OpName OpName) const {
1402 MachineBasicBlock *MBB = CI.I->getParent();
1403 DebugLoc DL = CI.I->getDebugLoc();
1404
1405 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1406
1407 // Copy to the new source register.
1408 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1409 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1410
1411 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1412 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1413
1414 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1415 .add(*Src0)
1416 .addImm(SubRegIdx0)
1417 .add(*Src1)
1418 .addImm(SubRegIdx1);
1419
1420 return SrcReg;
1421}
1422
1423unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1424 if (STM->ldsRequiresM0Init())
1425 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1426 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1427}
1428
1429unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1430 if (STM->ldsRequiresM0Init())
1431 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1432
1433 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1434 : AMDGPU::DS_READ2ST64_B64_gfx9;
1435}
1436
1437MachineBasicBlock::iterator
1438SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1439 MachineBasicBlock::iterator InsertBefore) {
1440 MachineBasicBlock *MBB = CI.I->getParent();
1441
1442 // Be careful, since the addresses could be subregisters themselves in weird
1443 // cases, like vectors of pointers.
1444 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1445
1446 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1447 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1448 unsigned Opc =
1449 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1450
1451 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1452 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1453
1454 const MCInstrDesc &Read2Desc = TII->get(Opc);
1455
1456 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1457 Register DestReg = MRI->createVirtualRegister(SuperRC);
1458
1459 DebugLoc DL = CI.I->getDebugLoc();
1460
1461 Register BaseReg = AddrReg->getReg();
1462 unsigned BaseSubReg = AddrReg->getSubReg();
1463 unsigned BaseRegFlags = 0;
1464 if (CI.BaseOff) {
1465 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1466 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1467 .addImm(CI.BaseOff);
1468
1469 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1470 BaseRegFlags = RegState::Kill;
1471
1472 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1473 .addReg(ImmReg)
1474 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1475 .addImm(0); // clamp bit
1476 BaseSubReg = 0;
1477 }
1478
1479 MachineInstrBuilder Read2 =
1480 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1481 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1482 .addImm(NewOffset0) // offset0
1483 .addImm(NewOffset1) // offset1
1484 .addImm(0) // gds
1485 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1486
1487 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1488
1489 CI.I->eraseFromParent();
1490 Paired.I->eraseFromParent();
1491
1492 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1493 return Read2;
1494}
1495
1496unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1497 if (STM->ldsRequiresM0Init())
1498 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1499 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1500 : AMDGPU::DS_WRITE2_B64_gfx9;
1501}
1502
1503unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1504 if (STM->ldsRequiresM0Init())
1505 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1506 : AMDGPU::DS_WRITE2ST64_B64;
1507
1508 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1509 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1510}
1511
1512unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
1513 return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1514}
1515
1516MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1517 CombineInfo &CI, CombineInfo &Paired,
1518 MachineBasicBlock::iterator InsertBefore) {
1519 MachineBasicBlock *MBB = CI.I->getParent();
1520
1521 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1522 // sure we preserve the subregister index and any register flags set on them.
1523 const MachineOperand *AddrReg =
1524 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1525 const MachineOperand *Data0 =
1526 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1527 const MachineOperand *Data1 =
1528 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1529
1530 unsigned NewOffset0 = CI.Offset;
1531 unsigned NewOffset1 = Paired.Offset;
1532 unsigned Opc = getWrite2Opcode(CI);
1533
1534 if (NewOffset0 > NewOffset1) {
1535 // Canonicalize the merged instruction so the smaller offset comes first.
1536 std::swap(NewOffset0, NewOffset1);
1537 std::swap(Data0, Data1);
1538 }
1539
1540 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1541 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1542
1543 const MCInstrDesc &Write2Desc = TII->get(Opc);
1544 DebugLoc DL = CI.I->getDebugLoc();
1545
1546 Register BaseReg = AddrReg->getReg();
1547 unsigned BaseSubReg = AddrReg->getSubReg();
1548 unsigned BaseRegFlags = 0;
1549 if (CI.BaseOff) {
1550 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1551 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1552 .addImm(CI.BaseOff);
1553
1554 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1555 BaseRegFlags = RegState::Kill;
1556
1557 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1558 .addReg(ImmReg)
1559 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1560 .addImm(0); // clamp bit
1561 BaseSubReg = 0;
1562 }
1563
1564 MachineInstrBuilder Write2 =
1565 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1566 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1567 .add(*Data0) // data0
1568 .add(*Data1) // data1
1569 .addImm(NewOffset0) // offset0
1570 .addImm(NewOffset1) // offset1
1571 .addImm(0) // gds
1572 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1573
1574 CI.I->eraseFromParent();
1575 Paired.I->eraseFromParent();
1576
1577 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1578 return Write2;
1579}
1580
1581MachineBasicBlock::iterator
1582SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1583 MachineBasicBlock::iterator InsertBefore) {
1584 MachineBasicBlock *MBB = CI.I->getParent();
1585 DebugLoc DL = CI.I->getDebugLoc();
1586 const unsigned Opcode = getNewOpcode(CI, Paired);
1587
1588 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1589
1590 Register DestReg = MRI->createVirtualRegister(SuperRC);
1591 unsigned MergedDMask = CI.DMask | Paired.DMask;
1592 unsigned DMaskIdx =
1593 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1594
1595 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1596 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1597 if (I == DMaskIdx)
1598 MIB.addImm(MergedDMask);
1599 else
1600 MIB.add((*CI.I).getOperand(I));
1601 }
1602
1603 // It shouldn't be possible to get this far if the two instructions
1604 // don't have a single memoperand, because MachineInstr::mayAlias()
1605 // will return true if this is the case.
1606 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1607
1608 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1609
1610 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1611
1612 CI.I->eraseFromParent();
1613 Paired.I->eraseFromParent();
1614 return New;
1615}
1616
1617MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1618 CombineInfo &CI, CombineInfo &Paired,
1619 MachineBasicBlock::iterator InsertBefore) {
1620 MachineBasicBlock *MBB = CI.I->getParent();
1621 DebugLoc DL = CI.I->getDebugLoc();
1622 const unsigned Opcode = getNewOpcode(CI, Paired);
1623
1624 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1625
1626 Register DestReg = MRI->createVirtualRegister(SuperRC);
1627 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1628
1629 // It shouldn't be possible to get this far if the two instructions
1630 // don't have a single memoperand, because MachineInstr::mayAlias()
1631 // will return true if this is the case.
1632 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1633
1634 MachineInstrBuilder New =
1635 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1636 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1637 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1638 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1639 New.addImm(MergedOffset);
1640 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1641
1642 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1643
1644 CI.I->eraseFromParent();
1645 Paired.I->eraseFromParent();
1646 return New;
1647}
1648
1649MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1650 CombineInfo &CI, CombineInfo &Paired,
1651 MachineBasicBlock::iterator InsertBefore) {
1652 MachineBasicBlock *MBB = CI.I->getParent();
1653 DebugLoc DL = CI.I->getDebugLoc();
1654
1655 const unsigned Opcode = getNewOpcode(CI, Paired);
1656
1657 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1658
1659 // Copy to the new source register.
1660 Register DestReg = MRI->createVirtualRegister(SuperRC);
1661 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1662
1663 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1664
1665 AddressRegs Regs = getRegs(Opcode, *TII);
1666
1667 if (Regs.VAddr)
1668 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1669
1670 // It shouldn't be possible to get this far if the two instructions
1671 // don't have a single memoperand, because MachineInstr::mayAlias()
1672 // will return true if this is the case.
1673 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1674
1675 MachineInstr *New =
1676 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1677 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1678 .addImm(MergedOffset) // offset
1679 .addImm(CI.CPol) // cpol
1680 .addImm(0) // swz
1681 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1682
1683 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1684
1685 CI.I->eraseFromParent();
1686 Paired.I->eraseFromParent();
1687 return New;
1688}
1689
1690MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1691 CombineInfo &CI, CombineInfo &Paired,
1692 MachineBasicBlock::iterator InsertBefore) {
1693 MachineBasicBlock *MBB = CI.I->getParent();
1694 DebugLoc DL = CI.I->getDebugLoc();
1695
1696 const unsigned Opcode = getNewOpcode(CI, Paired);
1697
1698 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1699
1700 // Copy to the new source register.
1701 Register DestReg = MRI->createVirtualRegister(SuperRC);
1702 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1703
1704 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1705
1706 AddressRegs Regs = getRegs(Opcode, *TII);
1707
1708 if (Regs.VAddr)
1709 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1710
1711 // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1712 // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1713 // and use XYZ of XYZW to enable the merge.
1714 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1715 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1716 NumCombinedComponents = 4;
1717 unsigned JoinedFormat =
1718 getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
1719
1720 // It shouldn't be possible to get this far if the two instructions
1721 // don't have a single memoperand, because MachineInstr::mayAlias()
1722 // will return true if this is the case.
1723 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1724
1725 MachineInstr *New =
1726 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1727 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1728 .addImm(MergedOffset) // offset
1729 .addImm(JoinedFormat) // format
1730 .addImm(CI.CPol) // cpol
1731 .addImm(0) // swz
1732 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1733
1734 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1735
1736 CI.I->eraseFromParent();
1737 Paired.I->eraseFromParent();
1738 return New;
1739}
1740
1741MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1742 CombineInfo &CI, CombineInfo &Paired,
1743 MachineBasicBlock::iterator InsertBefore) {
1744 MachineBasicBlock *MBB = CI.I->getParent();
1745 DebugLoc DL = CI.I->getDebugLoc();
1746
1747 const unsigned Opcode = getNewOpcode(CI, Paired);
1748
1749 Register SrcReg =
1750 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1751
1752 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1753 .addReg(SrcReg, RegState::Kill);
1754
1755 AddressRegs Regs = getRegs(Opcode, *TII);
1756
1757 if (Regs.VAddr)
1758 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1759
1760 // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1761 // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1762 // and use XYZ of XYZW to enable the merge.
1763 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1764 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1765 NumCombinedComponents = 4;
1766 unsigned JoinedFormat =
1767 getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
1768
1769 // It shouldn't be possible to get this far if the two instructions
1770 // don't have a single memoperand, because MachineInstr::mayAlias()
1771 // will return true if this is the case.
1772 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1773
1774 MachineInstr *New =
1775 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1776 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1777 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1778 .addImm(JoinedFormat) // format
1779 .addImm(CI.CPol) // cpol
1780 .addImm(0) // swz
1781 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1782
1783 CI.I->eraseFromParent();
1784 Paired.I->eraseFromParent();
1785 return New;
1786}
1787
1788MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1789 CombineInfo &CI, CombineInfo &Paired,
1790 MachineBasicBlock::iterator InsertBefore) {
1791 MachineBasicBlock *MBB = CI.I->getParent();
1792 DebugLoc DL = CI.I->getDebugLoc();
1793
1794 const unsigned Opcode = getNewOpcode(CI, Paired);
1795
1796 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1797 Register DestReg = MRI->createVirtualRegister(SuperRC);
1798
1799 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1800
1801 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1802 MIB.add(*SAddr);
1803
1804 MachineInstr *New =
1805 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1806 .addImm(std::min(CI.Offset, Paired.Offset))
1807 .addImm(CI.CPol)
1808 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1809
1810 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1811
1812 CI.I->eraseFromParent();
1813 Paired.I->eraseFromParent();
1814 return New;
1815}
1816
1817MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1818 CombineInfo &CI, CombineInfo &Paired,
1819 MachineBasicBlock::iterator InsertBefore) {
1820 MachineBasicBlock *MBB = CI.I->getParent();
1821 DebugLoc DL = CI.I->getDebugLoc();
1822
1823 const unsigned Opcode = getNewOpcode(CI, Paired);
1824
1825 Register SrcReg =
1826 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1827
1828 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1829 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1830 .addReg(SrcReg, RegState::Kill);
1831
1832 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1833 MIB.add(*SAddr);
1834
1835 MachineInstr *New =
1836 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1837 .addImm(CI.CPol)
1838 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1839
1840 CI.I->eraseFromParent();
1841 Paired.I->eraseFromParent();
1842 return New;
1843}
1844
1845static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1846 ArrayRef<MachineMemOperand *> MMOs,
1847 unsigned Width) {
1848 // Conservatively returns true if the MMO was not found.
1849 return STM.isXNACKEnabled() &&
1850 (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1851}
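// For example, when two dword loads are merged (Width == 2) with XNACK enabled,
// the callers keep the ordinary opcode only if the single merged memoperand is
// known to be at least Width * 4 = 8 bytes aligned; a missing or under-aligned
// memoperand conservatively selects the constrained (_ec) opcode instead.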
1852
1853unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1854 const CombineInfo &Paired) {
1855 const unsigned Width = CI.Width + Paired.Width;
1856
1857 switch (getCommonInstClass(CI, Paired)) {
1858 default:
1859 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1860 // FIXME: Handle d16 correctly
1861 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1862 Width);
1863 case TBUFFER_LOAD:
1864 case TBUFFER_STORE:
1865 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1866 Width);
1867
1868 case UNKNOWN:
1869 llvm_unreachable("Unknown instruction class");
1870 case S_BUFFER_LOAD_IMM: {
1871 // If XNACK is enabled, use the constrained opcodes when the first load is
1872 // under-aligned.
1873 bool NeedsConstrainedOpc =
1874 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1875 switch (Width) {
1876 default:
1877 return 0;
1878 case 2:
1879 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1880 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1881 case 3:
1882 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1883 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1884 case 4:
1885 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1886 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1887 case 8:
1888 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1889 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1890 }
1891 }
1892 case S_BUFFER_LOAD_SGPR_IMM: {
1893 // If XNACK is enabled, use the constrained opcodes when the first load is
1894 // under-aligned.
1895 bool NeedsConstrainedOpc =
1896 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1897 switch (Width) {
1898 default:
1899 return 0;
1900 case 2:
1901 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1902 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1903 case 3:
1904 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1905 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1906 case 4:
1907 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1908 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1909 case 8:
1910 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1911 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1912 }
1913 }
1914 case S_LOAD_IMM: {
1915 // If XNACK is enabled, use the constrained opcodes when the first load is
1916 // under-aligned.
1917 bool NeedsConstrainedOpc =
1918 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1919 switch (Width) {
1920 default:
1921 return 0;
1922 case 2:
1923 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1924 : AMDGPU::S_LOAD_DWORDX2_IMM;
1925 case 3:
1926 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1927 : AMDGPU::S_LOAD_DWORDX3_IMM;
1928 case 4:
1929 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1930 : AMDGPU::S_LOAD_DWORDX4_IMM;
1931 case 8:
1932 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1933 : AMDGPU::S_LOAD_DWORDX8_IMM;
1934 }
1935 }
1936 case GLOBAL_LOAD:
1937 switch (Width) {
1938 default:
1939 return 0;
1940 case 2:
1941 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1942 case 3:
1943 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1944 case 4:
1945 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1946 }
1947 case GLOBAL_LOAD_SADDR:
1948 switch (Width) {
1949 default:
1950 return 0;
1951 case 2:
1952 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1953 case 3:
1954 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1955 case 4:
1956 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1957 }
1958 case GLOBAL_STORE:
1959 switch (Width) {
1960 default:
1961 return 0;
1962 case 2:
1963 return AMDGPU::GLOBAL_STORE_DWORDX2;
1964 case 3:
1965 return AMDGPU::GLOBAL_STORE_DWORDX3;
1966 case 4:
1967 return AMDGPU::GLOBAL_STORE_DWORDX4;
1968 }
1969 case GLOBAL_STORE_SADDR:
1970 switch (Width) {
1971 default:
1972 return 0;
1973 case 2:
1974 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1975 case 3:
1976 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1977 case 4:
1978 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1979 }
1980 case FLAT_LOAD:
1981 switch (Width) {
1982 default:
1983 return 0;
1984 case 2:
1985 return AMDGPU::FLAT_LOAD_DWORDX2;
1986 case 3:
1987 return AMDGPU::FLAT_LOAD_DWORDX3;
1988 case 4:
1989 return AMDGPU::FLAT_LOAD_DWORDX4;
1990 }
1991 case FLAT_STORE:
1992 switch (Width) {
1993 default:
1994 return 0;
1995 case 2:
1996 return AMDGPU::FLAT_STORE_DWORDX2;
1997 case 3:
1998 return AMDGPU::FLAT_STORE_DWORDX3;
1999 case 4:
2000 return AMDGPU::FLAT_STORE_DWORDX4;
2001 }
2002 case FLAT_LOAD_SADDR:
2003 switch (Width) {
2004 default:
2005 return 0;
2006 case 2:
2007 return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
2008 case 3:
2009 return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
2010 case 4:
2011 return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
2012 }
2013 case FLAT_STORE_SADDR:
2014 switch (Width) {
2015 default:
2016 return 0;
2017 case 2:
2018 return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
2019 case 3:
2020 return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
2021 case 4:
2022 return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
2023 }
2024 case MIMG:
2025 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
2026 "No overlaps");
2027 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
2028 }
2029}
2030
2031std::pair<unsigned, unsigned>
2032SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
2033 const CombineInfo &Paired) {
2034 assert((CI.InstClass != MIMG ||
2035 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
2036 CI.Width + Paired.Width)) &&
2037 "No overlaps");
2038
2039 unsigned Idx0;
2040 unsigned Idx1;
2041
2042 static const unsigned Idxs[5][4] = {
2043 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
2044 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
2045 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
2046 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
2047 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
2048 };
2049
2050 assert(CI.Width >= 1 && CI.Width <= 4);
2051 assert(Paired.Width >= 1 && Paired.Width <= 4);
2052
2053 if (Paired < CI) {
2054 Idx1 = Idxs[0][Paired.Width - 1];
2055 Idx0 = Idxs[Paired.Width][CI.Width - 1];
2056 } else {
2057 Idx0 = Idxs[0][CI.Width - 1];
2058 Idx1 = Idxs[CI.Width][Paired.Width - 1];
2059 }
2060
2061 return {Idx0, Idx1};
2062}
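// Worked example: for CI.Width == 2 and Paired.Width == 1, the result is
// {sub0_sub1, sub2} when !(Paired < CI), so CI's data occupies the two low
// dwords of the merged register; when Paired < CI it is {sub1_sub2, sub0}.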
2063
2064const TargetRegisterClass *
2065SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
2066 const CombineInfo &Paired) const {
2067 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
2068 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
2069 switch (CI.Width + Paired.Width) {
2070 default:
2071 return nullptr;
2072 case 2:
2073 return &AMDGPU::SReg_64_XEXECRegClass;
2074 case 3:
2075 return &AMDGPU::SGPR_96RegClass;
2076 case 4:
2077 return &AMDGPU::SGPR_128RegClass;
2078 case 8:
2079 return &AMDGPU::SGPR_256RegClass;
2080 case 16:
2081 return &AMDGPU::SGPR_512RegClass;
2082 }
2083 }
2084
2085 // FIXME: This should compute the instruction to use, and then use the result
2086 // of TII->getRegClass.
2087 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
2088 return TRI->isAGPRClass(getDataRegClass(*CI.I))
2089 ? TRI->getAGPRClassForBitWidth(BitWidth)
2090 : TRI->getVGPRClassForBitWidth(BitWidth);
2091}
2092
2093MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
2094 CombineInfo &CI, CombineInfo &Paired,
2095 MachineBasicBlock::iterator InsertBefore) {
2096 MachineBasicBlock *MBB = CI.I->getParent();
2097 DebugLoc DL = CI.I->getDebugLoc();
2098
2099 const unsigned Opcode = getNewOpcode(CI, Paired);
2100
2101 Register SrcReg =
2102 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
2103
2104 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
2105 .addReg(SrcReg, RegState::Kill);
2106
2107 AddressRegs Regs = getRegs(Opcode, *TII);
2108
2109 if (Regs.VAddr)
2110 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
2111
2112
2113 // It shouldn't be possible to get this far if the two instructions
2114 // don't have a single memoperand, because MachineInstr::mayAlias()
2115 // will return true if this is the case.
2116 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
2117
2118 MachineInstr *New =
2119 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
2120 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
2121 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
2122 .addImm(CI.CPol) // cpol
2123 .addImm(0) // swz
2124 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
2125
2126 CI.I->eraseFromParent();
2127 Paired.I->eraseFromParent();
2128 return New;
2129}
2130
2131 MachineOperand
2132SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
2133 APInt V(32, Val, true);
2134 if (TII->isInlineConstant(V))
2135 return MachineOperand::CreateImm(Val);
2136
2137 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2138 MachineInstr *Mov =
2139 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
2140 TII->get(AMDGPU::S_MOV_B32), Reg)
2141 .addImm(Val);
2142 (void)Mov;
2143 LLVM_DEBUG(dbgs() << " "; Mov->dump());
2144 return MachineOperand::CreateReg(Reg, false);
2145}
2146
2147// Compute base address using Addr and return the final register.
2148Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
2149 const MemAddress &Addr) const {
2150 MachineBasicBlock *MBB = MI.getParent();
2151 MachineBasicBlock::iterator MBBI = MI.getIterator();
2152 DebugLoc DL = MI.getDebugLoc();
2153
2154 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2155 Addr.Base.LoSubReg) &&
2156 "Expected 32-bit Base-Register-Low!!");
2157
2158 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2159 Addr.Base.HiSubReg) &&
2160 "Expected 32-bit Base-Register-Hi!!");
2161
2162 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
2163 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
2164 MachineOperand OffsetHi =
2165 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
2166
2167 const auto *CarryRC = TRI->getWaveMaskRegClass();
2168 Register CarryReg = MRI->createVirtualRegister(CarryRC);
2169 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2170
2171 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2172 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2173 MachineInstr *LoHalf =
2174 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2175 .addReg(CarryReg, RegState::Define)
2176 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
2177 .add(OffsetLo)
2178 .addImm(0); // clamp bit
2179 (void)LoHalf;
2180 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
2181
2182 MachineInstr *HiHalf =
2183 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2184 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2185 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2186 .add(OffsetHi)
2187 .addReg(CarryReg, RegState::Kill)
2188 .addImm(0); // clamp bit
2189 (void)HiHalf;
2190 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
2191
2192 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2193 MachineInstr *FullBase =
2194 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2195 .addReg(DestSub0)
2196 .addImm(AMDGPU::sub0)
2197 .addReg(DestSub1)
2198 .addImm(AMDGPU::sub1);
2199 (void)FullBase;
2200 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
2201
2202 return FullDestReg;
2203}
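// Rough sketch of the MIR built above when Addr.Offset is not an inline
// constant (register names are illustrative only):
//   %lo_off:sreg_32 = S_MOV_B32 <Offset lo32>
//   %hi_off:sreg_32 = S_MOV_B32 <Offset hi32>
//   %sub0:vgpr_32, %carry = V_ADD_CO_U32_e64 %Base.Lo, %lo_off, 0
//   %sub1:vgpr_32, %dead = V_ADDC_U32_e64 %Base.Hi, %hi_off, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %sub0, %subreg.sub0, %sub1, %subreg.sub1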
2204
2205// Update base and offset with the NewBase and NewOffset in MI.
2206void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2207 Register NewBase,
2208 int32_t NewOffset) const {
2209 auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2210 Base->setReg(NewBase);
2211 Base->setIsKill(false);
2212 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2213}
2214
2215std::optional<int32_t>
2216SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2217 if (Op.isImm())
2218 return Op.getImm();
2219
2220 if (!Op.isReg())
2221 return std::nullopt;
2222
2223 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2224 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2225 !Def->getOperand(1).isImm())
2226 return std::nullopt;
2227
2228 return Def->getOperand(1).getImm();
2229}
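// For example, for an operand defined by "%off:sgpr_32 = S_MOV_B32 8000" (or an
// immediate operand of value 8000) this returns 8000; any other defining
// instruction yields std::nullopt.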
2230
2231// Analyze Base and extracts:
2232// - 32bit base registers, subregisters
2233// - 64bit constant offset
2234// Expecting base computation as:
2235// %OFFSET0:sgpr_32 = S_MOV_B32 8000
2236// %LO:vgpr_32, %c:sreg_64_xexec =
2237// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2238// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2239// %Base:vreg_64 =
2240// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2241void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2242 MemAddress &Addr) const {
2243 if (!Base.isReg())
2244 return;
2245
2246 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2247 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2248 || Def->getNumOperands() != 5)
2249 return;
2250
2251 MachineOperand BaseLo = Def->getOperand(1);
2252 MachineOperand BaseHi = Def->getOperand(3);
2253 if (!BaseLo.isReg() || !BaseHi.isReg())
2254 return;
2255
2256 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2257 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2258
2259 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2260 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2261 return;
2262
2263 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2264 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2265
2266 auto Offset0P = extractConstOffset(*Src0);
2267 if (Offset0P)
2268 BaseLo = *Src1;
2269 else {
2270 if (!(Offset0P = extractConstOffset(*Src1)))
2271 return;
2272 BaseLo = *Src0;
2273 }
2274
2275 if (!BaseLo.isReg())
2276 return;
2277
2278 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2279 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2280
2281 if (Src0->isImm())
2282 std::swap(Src0, Src1);
2283
2284 if (!Src1->isImm() || Src0->isImm())
2285 return;
2286
2287 uint64_t Offset1 = Src1->getImm();
2288 BaseHi = *Src0;
2289
2290 if (!BaseHi.isReg())
2291 return;
2292
2293 Addr.Base.LoReg = BaseLo.getReg();
2294 Addr.Base.HiReg = BaseHi.getReg();
2295 Addr.Base.LoSubReg = BaseLo.getSubReg();
2296 Addr.Base.HiSubReg = BaseHi.getSubReg();
2297 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2298}
2299
2300bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2301 MachineInstr &MI,
2302 MemInfoMap &Visited,
2303 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2304
2305 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2306 return false;
2307
2308 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2309 if (SIInstrInfo::isFLATScratch(MI))
2310 return false;
2311
2312 unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2313 : AMDGPUAS::FLAT_ADDRESS;
2314
2315 if (AnchorList.count(&MI))
2316 return false;
2317
2318 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2319
2320 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2321 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2322 return false;
2323 }
2324
2325 // Step1: Find the base-registers and a 64bit constant offset.
2326 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2327 auto [It, Inserted] = Visited.try_emplace(&MI);
2328 MemAddress MAddr;
2329 if (Inserted) {
2330 processBaseWithConstOffset(Base, MAddr);
2331 It->second = MAddr;
2332 } else
2333 MAddr = It->second;
2334
2335 if (MAddr.Offset == 0) {
2336 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2337 " constant offsets that can be promoted.\n";);
2338 return false;
2339 }
2340
2341 LLVM_DEBUG(dbgs() << " BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
2342 << printReg(MAddr.Base.LoReg, TRI)
2343 << "} Offset: " << MAddr.Offset << "\n\n";);
2344
2345 // Step2: Traverse through MI's basic block and find an anchor(that has the
2346 // same base-registers) with the highest 13bit distance from MI's offset.
2347 // E.g. (64bit loads)
2348 // bb:
2349 // addr1 = &a + 4096; load1 = load(addr1, 0)
2350 // addr2 = &a + 6144; load2 = load(addr2, 0)
2351 // addr3 = &a + 8192; load3 = load(addr3, 0)
2352 // addr4 = &a + 10240; load4 = load(addr4, 0)
2353 // addr5 = &a + 12288; load5 = load(addr5, 0)
2354 //
2355 // Starting from the first load, the optimization will try to find a new base
2356 // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
2357 // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
2358 // as the new-base(anchor) because of the maximum distance which can
2359 // accommodate more intermediate bases presumably.
2360 //
2361 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2362 // (&a + 8192) for load1, load2, load4.
2363 // addr = &a + 8192
2364 // load1 = load(addr, -4096)
2365 // load2 = load(addr, -2048)
2366 // load3 = load(addr, 0)
2367 // load4 = load(addr, 2048)
2368 // addr5 = &a + 12288; load5 = load(addr5, 0)
2369 //
2370 MachineInstr *AnchorInst = nullptr;
2371 MemAddress AnchorAddr;
2372 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2373 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2374
2375 MachineBasicBlock *MBB = MI.getParent();
2376 MachineBasicBlock::iterator E = MBB->end();
2377 MachineBasicBlock::iterator MBBI = MI.getIterator();
2378 ++MBBI;
2379 const SITargetLowering *TLI = STM->getTargetLowering();
2380
2381 for ( ; MBBI != E; ++MBBI) {
2382 MachineInstr &MINext = *MBBI;
2383 // TODO: Support finding an anchor(with same base) from store addresses or
2384 // any other load addresses where the opcodes are different.
2385 if (MINext.getOpcode() != MI.getOpcode() ||
2386 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2387 continue;
2388
2389 const MachineOperand &BaseNext =
2390 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2391 MemAddress MAddrNext;
2392 auto [It, Inserted] = Visited.try_emplace(&MINext);
2393 if (Inserted) {
2394 processBaseWithConstOffset(BaseNext, MAddrNext);
2395 It->second = MAddrNext;
2396 } else
2397 MAddrNext = It->second;
2398
2399 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2400 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2401 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2402 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2403 continue;
2404
2405 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2406
2407 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2408 TargetLoweringBase::AddrMode AM;
2409 AM.HasBaseReg = true;
2410 AM.BaseOffs = Dist;
2411 if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2412 (uint32_t)std::abs(Dist) > MaxDist) {
2413 MaxDist = std::abs(Dist);
2414
2415 AnchorAddr = MAddrNext;
2416 AnchorInst = &MINext;
2417 }
2418 }
2419
2420 if (AnchorInst) {
2421 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2422 AnchorInst->dump());
2423 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2424 << AnchorAddr.Offset << "\n\n");
2425
2426 // Instead of moving up, just re-compute anchor-instruction's base address.
2427 Register Base = computeBase(MI, AnchorAddr);
2428
2429 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2430 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2431
2432 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2433 TargetLoweringBase::AddrMode AM;
2434 AM.HasBaseReg = true;
2435 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2436
2437 if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2438 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
2439 OtherMI->dump());
2440 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2441 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
2442 }
2443 }
2444 AnchorList.insert(AnchorInst);
2445 return true;
2446 }
2447
2448 return false;
2449}
2450
2451void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2452 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2453 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2454 if (AddrList.front().InstClass == CI.InstClass &&
2455 AddrList.front().hasSameBaseAddress(CI)) {
2456 AddrList.emplace_back(CI);
2457 return;
2458 }
2459 }
2460
2461 // Base address not found, so add a new list.
2462 MergeableInsts.emplace_back(1, CI);
2463}
2464
2465std::pair<MachineBasicBlock::iterator, bool>
2466SILoadStoreOptimizer::collectMergeableInsts(
2467 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2468 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2469 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2470 bool Modified = false;
2471
2472 // Sort potential mergeable instructions into lists. One list per base address.
2473 unsigned Order = 0;
2474 MachineBasicBlock::iterator BlockI = Begin;
2475 for (; BlockI != End; ++BlockI) {
2476 MachineInstr &MI = *BlockI;
2477
2478 // We run this before checking if an address is mergeable, because it can produce
2479 // better code even if the instructions aren't mergeable.
2480 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2481 Modified = true;
2482
2483 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2484 // barriers. We can look after this barrier for separate merges.
2485 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2486 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2487
2488 // Search will resume after this instruction in a separate merge list.
2489 ++BlockI;
2490 break;
2491 }
2492
2493 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2494 if (InstClass == UNKNOWN)
2495 continue;
2496
2497 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2498 int Swizzled =
2499 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2500 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2501 continue;
2502
2503 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
2504 const MachineOperand *Fmt =
2505 TII->getNamedOperand(MI, AMDGPU::OpName::format);
2506 if (!AMDGPU::getGcnBufferFormatInfo(Fmt->getImm(), *STM)) {
2507 LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
2508 continue;
2509 }
2510 }
2511
2512 CombineInfo CI;
2513 CI.setMI(MI, *this);
2514 CI.Order = Order++;
2515
2516 if (!CI.hasMergeableAddress(*MRI))
2517 continue;
2518
2519 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2520
2521 addInstToMergeableList(CI, MergeableInsts);
2522 }
2523
2524 // At this point we have lists of Mergeable instructions.
2525 //
2526 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2527 // list try to find an instruction that can be merged with I. If an instruction
2528 // is found, it is stored in the Paired field. If no instructions are found, then
2529 // the CombineInfo object is deleted from the list.
2530
2531 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2532 E = MergeableInsts.end(); I != E;) {
2533
2534 std::list<CombineInfo> &MergeList = *I;
2535 if (MergeList.size() <= 1) {
2536 // This means we have found only one instruction with a given address
2537 // that can be merged, and we need at least 2 instructions to do a merge,
2538 // so this list can be discarded.
2539 I = MergeableInsts.erase(I);
2540 continue;
2541 }
2542
2543 // Sort the lists by offsets, this way mergeable instructions will be
2544 // adjacent to each other in the list, which will make it easier to find
2545 // matches.
2546 MergeList.sort(
2547 [] (const CombineInfo &A, const CombineInfo &B) {
2548 return A.Offset < B.Offset;
2549 });
2550 ++I;
2551 }
2552
2553 return {BlockI, Modified};
2554}
2555
2556// Scan through looking for adjacent LDS operations with constant offsets from
2557// the same base register. We rely on the scheduler to do the hard work of
2558// clustering nearby loads, and assume these are all adjacent.
2559bool SILoadStoreOptimizer::optimizeBlock(
2560 std::list<std::list<CombineInfo> > &MergeableInsts) {
2561 bool Modified = false;
2562
2563 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2564 E = MergeableInsts.end(); I != E;) {
2565 std::list<CombineInfo> &MergeList = *I;
2566
2567 bool OptimizeListAgain = false;
2568 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2569 // We weren't able to make any changes, so delete the list so we don't
2570 // process the same instructions the next time we try to optimize this
2571 // block.
2572 I = MergeableInsts.erase(I);
2573 continue;
2574 }
2575
2576 Modified = true;
2577
2578 // We made changes, but also determined that there were no more optimization
2579 // opportunities, so we don't need to reprocess the list
2580 if (!OptimizeListAgain) {
2581 I = MergeableInsts.erase(I);
2582 continue;
2583 }
2584 OptimizeAgain = true;
2585 }
2586 return Modified;
2587}
2588
2589bool
2590SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2591 std::list<CombineInfo> &MergeList,
2592 bool &OptimizeListAgain) {
2593 if (MergeList.empty())
2594 return false;
2595
2596 bool Modified = false;
2597
2598 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2599 Next = std::next(I)) {
2600
2601 auto First = I;
2602 auto Second = Next;
2603
2604 if ((*First).Order > (*Second).Order)
2605 std::swap(First, Second);
2606 CombineInfo &CI = *First;
2607 CombineInfo &Paired = *Second;
2608
2609 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2610 if (!Where) {
2611 ++I;
2612 continue;
2613 }
2614
2615 Modified = true;
2616
2617 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2618
2619 MachineBasicBlock::iterator NewMI;
2620 switch (CI.InstClass) {
2621 default:
2622 llvm_unreachable("unknown InstClass");
2623 break;
2624 case DS_READ:
2625 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2626 break;
2627 case DS_WRITE:
2628 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2629 break;
2630 case S_BUFFER_LOAD_IMM:
2631 case S_BUFFER_LOAD_SGPR_IMM:
2632 case S_LOAD_IMM:
2633 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2634 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2635 break;
2636 case BUFFER_LOAD:
2637 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2638 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2639 break;
2640 case BUFFER_STORE:
2641 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2642 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2643 break;
2644 case MIMG:
2645 NewMI = mergeImagePair(CI, Paired, Where->I);
2646 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2647 break;
2648 case TBUFFER_LOAD:
2649 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2650 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2651 break;
2652 case TBUFFER_STORE:
2653 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2654 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2655 break;
2656 case FLAT_LOAD:
2657 case FLAT_LOAD_SADDR:
2658 case GLOBAL_LOAD:
2659 case GLOBAL_LOAD_SADDR:
2660 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2661 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2662 break;
2663 case FLAT_STORE:
2664 case FLAT_STORE_SADDR:
2665 case GLOBAL_STORE:
2666 case GLOBAL_STORE_SADDR:
2667 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2668 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2669 break;
2670 }
2671 CI.setMI(NewMI, *this);
2672 CI.Order = Where->Order;
2673 if (I == Second)
2674 I = Next;
2675
2676 MergeList.erase(Second);
2677 }
2678
2679 return Modified;
2680}
2681
2682bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2683 if (skipFunction(MF.getFunction()))
2684 return false;
2685 return SILoadStoreOptimizer(
2686 &getAnalysis<AAResultsWrapperPass>().getAAResults())
2687 .run(MF);
2688}
2689
2690bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2691 this->MF = &MF;
2692 STM = &MF.getSubtarget<GCNSubtarget>();
2693 if (!STM->loadStoreOptEnabled())
2694 return false;
2695
2696 TII = STM->getInstrInfo();
2697 TRI = &TII->getRegisterInfo();
2698
2699 MRI = &MF.getRegInfo();
2700
2701 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2702
2703 bool Modified = false;
2704
2705 // Contains the list of instructions for which constant offsets are being
2706 // promoted to the IMM. This is tracked for an entire block at a time.
2707 SmallPtrSet<MachineInstr *, 4> AnchorList;
2708 MemInfoMap Visited;
2709
2710 for (MachineBasicBlock &MBB : MF) {
2711 MachineBasicBlock::iterator SectionEnd;
2712 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2713 I = SectionEnd) {
2714 bool CollectModified;
2715 std::list<std::list<CombineInfo>> MergeableInsts;
2716
2717 // First pass: Collect list of all instructions we know how to merge in a
2718 // subset of the block.
2719 std::tie(SectionEnd, CollectModified) =
2720 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2721
2722 Modified |= CollectModified;
2723
2724 do {
2725 OptimizeAgain = false;
2726 Modified |= optimizeBlock(MergeableInsts);
2727 } while (OptimizeAgain);
2728 }
2729
2730 Visited.clear();
2731 AnchorList.clear();
2732 }
2733
2734 return Modified;
2735}
2736
2737 PreservedAnalyses
2738SILoadStoreOptimizerPass::run(MachineFunction &MF,
2739 MachineFunctionAnalysisManager &MFAM) {
2740 MFPropsModifier _(*this, MF);
2741
2742 if (MF.getFunction().hasOptNone())
2743 return PreservedAnalyses::all();
2744
2745 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2746 .getManager();
2747 AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
2748
2749 bool Changed = SILoadStoreOptimizer(&AA).run(MF);
2750 if (!Changed)
2751 return PreservedAnalyses::all();
2752
2753 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2754 PA.preserveSet<CFGAnalyses>();
2755 return PA;
2756}