1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with nearby immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
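// (For reference: ds_read2's offset0/offset1 fields count elements rather than
// bytes, so with 4-byte b32 elements the byte offsets 16 and 32 above become
// 16/4 = 4 and 32/4 = 8.)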
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote constant offset to the immediate by
23// adjusting the base. It tries to use a base from the nearby instructions that
24// allows it to have a 13-bit constant offset and then promotes that 13-bit offset
25// to the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
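// Here the second base differs from the first by 0x1800 - 0x1000 = 0x800
// (2048) bytes, which fits in the immediate field, so the second load reuses
// the v[5:6] base and folds the difference into offset:2048.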
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants because loading
46// the constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offset fields, but are close enough together to fit within 8 bits of each
56// other, we can add to the base pointer and use the new reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
61#include "AMDGPU.h"
62#include "GCNSubtarget.h"
64#include "SIDefines.h"
68
69using namespace llvm;
70
71#define DEBUG_TYPE "si-load-store-opt"
72
73namespace {
74enum InstClassEnum {
75 UNKNOWN,
76 DS_READ,
77 DS_WRITE,
78 S_BUFFER_LOAD_IMM,
79 S_BUFFER_LOAD_SGPR_IMM,
80 S_LOAD_IMM,
81 BUFFER_LOAD,
82 BUFFER_STORE,
83 MIMG,
84 TBUFFER_LOAD,
85 TBUFFER_STORE,
86 GLOBAL_LOAD_SADDR,
87 GLOBAL_STORE_SADDR,
88 FLAT_LOAD,
89 FLAT_STORE,
90 FLAT_LOAD_SADDR,
91 FLAT_STORE_SADDR,
92 GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
93 GLOBAL_STORE // any CombineInfo, they are only ever returned by
94 // getCommonInstClass.
95};
96
97struct AddressRegs {
98 unsigned char NumVAddrs = 0;
99 bool SBase = false;
100 bool SRsrc = false;
101 bool SOffset = false;
102 bool SAddr = false;
103 bool VAddr = false;
104 bool Addr = false;
105 bool SSamp = false;
106};
107
108// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
109const unsigned MaxAddressRegs = 12 + 1 + 1;
110
111class SILoadStoreOptimizer {
112 struct CombineInfo {
113 MachineBasicBlock::iterator I;
114 unsigned EltSize;
115 unsigned Offset;
116 unsigned Width;
117 unsigned Format;
118 unsigned BaseOff;
119 unsigned DMask;
120 InstClassEnum InstClass;
121 unsigned CPol = 0;
122 bool IsAGPR;
123 bool UseST64;
124 int AddrIdx[MaxAddressRegs];
125 const MachineOperand *AddrReg[MaxAddressRegs];
126 unsigned NumAddresses;
127 unsigned Order;
128
129 bool hasSameBaseAddress(const CombineInfo &CI) {
130 if (NumAddresses != CI.NumAddresses)
131 return false;
132
133 const MachineInstr &MI = *CI.I;
134 for (unsigned i = 0; i < NumAddresses; i++) {
135 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
136
137 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
138 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
139 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
140 return false;
141 }
142 continue;
143 }
144
145 // Check same base pointer. Be careful of subregisters, which can occur
146 // with vectors of pointers.
147 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
148 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
149 return false;
150 }
151 }
152 return true;
153 }
154
155 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
156 for (unsigned i = 0; i < NumAddresses; ++i) {
157 const MachineOperand *AddrOp = AddrReg[i];
158 // Immediates are always OK.
159 if (AddrOp->isImm())
160 continue;
161
162 // Don't try to merge addresses that aren't either immediates or registers.
163 // TODO: Should be possible to merge FrameIndexes and maybe some other
164 // non-register operands.
165 if (!AddrOp->isReg())
166 return false;
167
168 // TODO: We should be able to merge instructions with other physical reg
169 // addresses too.
170 if (AddrOp->getReg().isPhysical() &&
171 AddrOp->getReg() != AMDGPU::SGPR_NULL)
172 return false;
173
174 // If an address has only one use then there will be no other
175 // instructions with the same address, so we can't merge this one.
176 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
177 return false;
178 }
179 return true;
180 }
181
182 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
183
184 // Compare by pointer order.
185 bool operator<(const CombineInfo& Other) const {
186 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
187 }
188 };
189
190 struct BaseRegisters {
191 Register LoReg;
192 Register HiReg;
193
194 unsigned LoSubReg = 0;
195 unsigned HiSubReg = 0;
196 };
197
198 struct MemAddress {
199 BaseRegisters Base;
200 int64_t Offset = 0;
201 };
202
203 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
204
205private:
206 const GCNSubtarget *STM = nullptr;
207 const SIInstrInfo *TII = nullptr;
208 const SIRegisterInfo *TRI = nullptr;
209 MachineRegisterInfo *MRI = nullptr;
210 AliasAnalysis *AA = nullptr;
211 bool OptimizeAgain;
212
213 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
214 const DenseSet<Register> &ARegUses,
215 const MachineInstr &A, const MachineInstr &B) const;
216 static bool dmasksCanBeCombined(const CombineInfo &CI,
217 const SIInstrInfo &TII,
218 const CombineInfo &Paired);
219 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
220 CombineInfo &Paired, bool Modify = false);
221 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
222 const CombineInfo &Paired);
223 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
224 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
225 const CombineInfo &Paired);
226 const TargetRegisterClass *
227 getTargetRegisterClass(const CombineInfo &CI,
228 const CombineInfo &Paired) const;
229 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
230
231 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
232
233 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
234 MachineBasicBlock::iterator InsertBefore,
235 AMDGPU::OpName OpName, Register DestReg) const;
236 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
237 MachineBasicBlock::iterator InsertBefore,
238 AMDGPU::OpName OpName) const;
239
240 unsigned read2Opcode(unsigned EltSize) const;
241 unsigned read2ST64Opcode(unsigned EltSize) const;
242 MachineBasicBlock::iterator
243 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
244 MachineBasicBlock::iterator InsertBefore);
245
246 unsigned write2Opcode(unsigned EltSize) const;
247 unsigned write2ST64Opcode(unsigned EltSize) const;
248 MachineBasicBlock::iterator
249 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
250 MachineBasicBlock::iterator InsertBefore);
251 MachineBasicBlock::iterator
252 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
253 MachineBasicBlock::iterator InsertBefore);
254 MachineBasicBlock::iterator
255 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
256 MachineBasicBlock::iterator InsertBefore);
257 MachineBasicBlock::iterator
258 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
259 MachineBasicBlock::iterator InsertBefore);
260 MachineBasicBlock::iterator
261 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
262 MachineBasicBlock::iterator InsertBefore);
263 MachineBasicBlock::iterator
264 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
265 MachineBasicBlock::iterator InsertBefore);
266 MachineBasicBlock::iterator
267 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
268 MachineBasicBlock::iterator InsertBefore);
269 MachineBasicBlock::iterator
270 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
271 MachineBasicBlock::iterator InsertBefore);
272 MachineBasicBlock::iterator
273 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
274 MachineBasicBlock::iterator InsertBefore);
275
276 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
277 int32_t NewOffset) const;
278 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
279 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
280 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
281 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
282 /// Promotes constant offset to the immediate by adjusting the base. It
283 /// tries to use a base from the nearby instructions that allows it to have
284 /// a 13-bit constant offset which gets promoted to the immediate.
285 bool promoteConstantOffsetToImm(MachineInstr &CI,
286 MemInfoMap &Visited,
287 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
288 void addInstToMergeableList(const CombineInfo &CI,
289 std::list<std::list<CombineInfo> > &MergeableInsts) const;
290
291 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
292 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
293 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
294 std::list<std::list<CombineInfo>> &MergeableInsts) const;
295
296 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
297 const CombineInfo &Paired);
298
299 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
300 const CombineInfo &Paired);
301
302 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
303 bool &OptimizeListAgain);
304 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
305
306public:
307 SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
308 bool run(MachineFunction &MF);
309};
310
311class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
312public:
313 static char ID;
314
315 SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}
316
317 bool runOnMachineFunction(MachineFunction &MF) override;
318
319 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
320
321 void getAnalysisUsage(AnalysisUsage &AU) const override {
322 AU.setPreservesCFG();
323 AU.addRequired<AAResultsWrapperPass>();
324
325 MachineFunctionPass::getAnalysisUsage(AU);
326 }
327
328 MachineFunctionProperties getRequiredProperties() const override {
329 return MachineFunctionProperties().setIsSSA();
330 }
331};
332
333static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
334 const unsigned Opc = MI.getOpcode();
335
336 if (TII.isMUBUF(Opc)) {
337 // FIXME: Handle d16 correctly
338 return AMDGPU::getMUBUFElements(Opc);
339 }
340 if (TII.isImage(MI)) {
341 uint64_t DMaskImm =
342 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
343 return llvm::popcount(DMaskImm);
344 }
345 if (TII.isMTBUF(Opc)) {
346 return AMDGPU::getMTBUFElements(Opc);
347 }
348
349 switch (Opc) {
350 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
351 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
352 case AMDGPU::S_LOAD_DWORD_IMM:
353 case AMDGPU::GLOBAL_LOAD_DWORD:
354 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
355 case AMDGPU::GLOBAL_STORE_DWORD:
356 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
357 case AMDGPU::FLAT_LOAD_DWORD:
358 case AMDGPU::FLAT_STORE_DWORD:
359 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
360 case AMDGPU::FLAT_STORE_DWORD_SADDR:
361 return 1;
362 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
363 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
364 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
365 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
366 case AMDGPU::S_LOAD_DWORDX2_IMM:
367 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
368 case AMDGPU::GLOBAL_LOAD_DWORDX2:
369 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
370 case AMDGPU::GLOBAL_STORE_DWORDX2:
371 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
372 case AMDGPU::FLAT_LOAD_DWORDX2:
373 case AMDGPU::FLAT_STORE_DWORDX2:
374 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
375 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
376 return 2;
377 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
378 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
379 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
380 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
381 case AMDGPU::S_LOAD_DWORDX3_IMM:
382 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
383 case AMDGPU::GLOBAL_LOAD_DWORDX3:
384 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
385 case AMDGPU::GLOBAL_STORE_DWORDX3:
386 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
387 case AMDGPU::FLAT_LOAD_DWORDX3:
388 case AMDGPU::FLAT_STORE_DWORDX3:
389 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
390 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
391 return 3;
392 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
393 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
394 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
395 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
396 case AMDGPU::S_LOAD_DWORDX4_IMM:
397 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
398 case AMDGPU::GLOBAL_LOAD_DWORDX4:
399 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
400 case AMDGPU::GLOBAL_STORE_DWORDX4:
401 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
402 case AMDGPU::FLAT_LOAD_DWORDX4:
403 case AMDGPU::FLAT_STORE_DWORDX4:
404 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
405 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
406 return 4;
407 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
408 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
409 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
410 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
411 case AMDGPU::S_LOAD_DWORDX8_IMM:
412 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
413 return 8;
414 case AMDGPU::DS_READ_B32:
415 case AMDGPU::DS_READ_B32_gfx9:
416 case AMDGPU::DS_WRITE_B32:
417 case AMDGPU::DS_WRITE_B32_gfx9:
418 return 1;
419 case AMDGPU::DS_READ_B64:
420 case AMDGPU::DS_READ_B64_gfx9:
421 case AMDGPU::DS_WRITE_B64:
422 case AMDGPU::DS_WRITE_B64_gfx9:
423 return 2;
424 default:
425 return 0;
426 }
427}
428
429/// Maps instruction opcode to enum InstClassEnum.
430static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
431 switch (Opc) {
432 default:
433 if (TII.isMUBUF(Opc)) {
434 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
435 default:
436 return UNKNOWN;
437 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
438 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
439 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
440 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
441 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
442 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
443 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
444 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
445 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
446 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
447 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
448 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
449 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
450 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
451 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
452 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
453 return BUFFER_LOAD;
454 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
455 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
456 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
457 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
458 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
459 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
460 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
461 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
462 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
463 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
464 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
465 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
466 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
467 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
468 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
469 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
470 return BUFFER_STORE;
471 }
472 }
473 if (TII.isImage(Opc)) {
474 // Ignore instructions encoded without vaddr.
475 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
476 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
477 return UNKNOWN;
478 // Ignore BVH instructions
479 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
480 return UNKNOWN;
481 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
482 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
483 TII.isGather4(Opc))
484 return UNKNOWN;
485 return MIMG;
486 }
487 if (TII.isMTBUF(Opc)) {
488 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
489 default:
490 return UNKNOWN;
491 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
492 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
493 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
494 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
495 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
496 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
497 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
498 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
499 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
500 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
501 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
502 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
503 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
504 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
505 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
506 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
507 return TBUFFER_LOAD;
508 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
509 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
510 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
511 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
512 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
513 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
514 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
515 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
516 return TBUFFER_STORE;
517 }
518 }
519 return UNKNOWN;
520 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
521 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
522 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
523 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
524 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
525 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
526 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
527 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
528 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
529 return S_BUFFER_LOAD_IMM;
530 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
531 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
532 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
533 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
534 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
535 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
536 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
537 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
538 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
539 return S_BUFFER_LOAD_SGPR_IMM;
540 case AMDGPU::S_LOAD_DWORD_IMM:
541 case AMDGPU::S_LOAD_DWORDX2_IMM:
542 case AMDGPU::S_LOAD_DWORDX3_IMM:
543 case AMDGPU::S_LOAD_DWORDX4_IMM:
544 case AMDGPU::S_LOAD_DWORDX8_IMM:
545 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
546 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
547 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
548 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
549 return S_LOAD_IMM;
550 case AMDGPU::DS_READ_B32:
551 case AMDGPU::DS_READ_B32_gfx9:
552 case AMDGPU::DS_READ_B64:
553 case AMDGPU::DS_READ_B64_gfx9:
554 return DS_READ;
555 case AMDGPU::DS_WRITE_B32:
556 case AMDGPU::DS_WRITE_B32_gfx9:
557 case AMDGPU::DS_WRITE_B64:
558 case AMDGPU::DS_WRITE_B64_gfx9:
559 return DS_WRITE;
560 case AMDGPU::GLOBAL_LOAD_DWORD:
561 case AMDGPU::GLOBAL_LOAD_DWORDX2:
562 case AMDGPU::GLOBAL_LOAD_DWORDX3:
563 case AMDGPU::GLOBAL_LOAD_DWORDX4:
564 case AMDGPU::FLAT_LOAD_DWORD:
565 case AMDGPU::FLAT_LOAD_DWORDX2:
566 case AMDGPU::FLAT_LOAD_DWORDX3:
567 case AMDGPU::FLAT_LOAD_DWORDX4:
568 return FLAT_LOAD;
569 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
570 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
571 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
572 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
573 return GLOBAL_LOAD_SADDR;
574 case AMDGPU::GLOBAL_STORE_DWORD:
575 case AMDGPU::GLOBAL_STORE_DWORDX2:
576 case AMDGPU::GLOBAL_STORE_DWORDX3:
577 case AMDGPU::GLOBAL_STORE_DWORDX4:
578 case AMDGPU::FLAT_STORE_DWORD:
579 case AMDGPU::FLAT_STORE_DWORDX2:
580 case AMDGPU::FLAT_STORE_DWORDX3:
581 case AMDGPU::FLAT_STORE_DWORDX4:
582 return FLAT_STORE;
583 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
584 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
585 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
586 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
587 return GLOBAL_STORE_SADDR;
588 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
589 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
590 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
591 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
592 return FLAT_LOAD_SADDR;
593 case AMDGPU::FLAT_STORE_DWORD_SADDR:
594 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
595 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
596 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
597 return FLAT_STORE_SADDR;
598 }
599}
600
601/// Determines instruction subclass from opcode. Only instructions
602/// of the same subclass can be merged together. The merged instruction may have
603/// a different subclass but must have the same class.
604static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
605 switch (Opc) {
606 default:
607 if (TII.isMUBUF(Opc))
608 return AMDGPU::getMUBUFBaseOpcode(Opc);
609 if (TII.isImage(Opc)) {
610 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
611 assert(Info);
612 return Info->BaseOpcode;
613 }
614 if (TII.isMTBUF(Opc))
615 return AMDGPU::getMTBUFBaseOpcode(Opc);
616 return -1;
617 case AMDGPU::DS_READ_B32:
618 case AMDGPU::DS_READ_B32_gfx9:
619 case AMDGPU::DS_READ_B64:
620 case AMDGPU::DS_READ_B64_gfx9:
621 case AMDGPU::DS_WRITE_B32:
622 case AMDGPU::DS_WRITE_B32_gfx9:
623 case AMDGPU::DS_WRITE_B64:
624 case AMDGPU::DS_WRITE_B64_gfx9:
625 return Opc;
626 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
627 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
628 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
629 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
630 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
631 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
632 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
633 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
634 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
635 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
636 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
637 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
638 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
639 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
640 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
641 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
642 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
643 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
644 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
645 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
646 case AMDGPU::S_LOAD_DWORD_IMM:
647 case AMDGPU::S_LOAD_DWORDX2_IMM:
648 case AMDGPU::S_LOAD_DWORDX3_IMM:
649 case AMDGPU::S_LOAD_DWORDX4_IMM:
650 case AMDGPU::S_LOAD_DWORDX8_IMM:
651 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
652 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
653 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
654 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
655 return AMDGPU::S_LOAD_DWORD_IMM;
656 case AMDGPU::GLOBAL_LOAD_DWORD:
657 case AMDGPU::GLOBAL_LOAD_DWORDX2:
658 case AMDGPU::GLOBAL_LOAD_DWORDX3:
659 case AMDGPU::GLOBAL_LOAD_DWORDX4:
660 case AMDGPU::FLAT_LOAD_DWORD:
661 case AMDGPU::FLAT_LOAD_DWORDX2:
662 case AMDGPU::FLAT_LOAD_DWORDX3:
663 case AMDGPU::FLAT_LOAD_DWORDX4:
664 return AMDGPU::FLAT_LOAD_DWORD;
665 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
666 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
667 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
668 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
669 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
670 case AMDGPU::GLOBAL_STORE_DWORD:
671 case AMDGPU::GLOBAL_STORE_DWORDX2:
672 case AMDGPU::GLOBAL_STORE_DWORDX3:
673 case AMDGPU::GLOBAL_STORE_DWORDX4:
674 case AMDGPU::FLAT_STORE_DWORD:
675 case AMDGPU::FLAT_STORE_DWORDX2:
676 case AMDGPU::FLAT_STORE_DWORDX3:
677 case AMDGPU::FLAT_STORE_DWORDX4:
678 return AMDGPU::FLAT_STORE_DWORD;
679 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
680 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
681 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
682 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
683 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
684 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
685 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
686 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
687 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
688 return AMDGPU::FLAT_LOAD_DWORD_SADDR;
689 case AMDGPU::FLAT_STORE_DWORD_SADDR:
690 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
691 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
692 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
693 return AMDGPU::FLAT_STORE_DWORD_SADDR;
694 }
695}
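// Illustrative example (not exhaustive): S_LOAD_DWORDX2_IMM and
// S_LOAD_DWORDX4_IMM both map to the S_LOAD_DWORD_IMM subclass above, so a
// 2-dword and a 4-dword scalar load may be considered for merging (e.g. into
// an 8-dword load), while an S_BUFFER_LOAD_*_IMM maps to a different subclass
// and is never paired with them.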
696
697// GLOBAL loads and stores are classified as FLAT initially. If both combined
698// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
699// If either or both instructions are non-segment-specific FLAT, the resulting
700// combined operation will be FLAT, potentially promoting one of the GLOBAL
701// operations to FLAT.
702// For other instructions, return the original class unmodified.
703InstClassEnum
704SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
705 const CombineInfo &Paired) {
706 assert(CI.InstClass == Paired.InstClass);
707
708 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
709 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
710 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
711
712 return CI.InstClass;
713}
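// For example, if two FLAT_LOAD-class instructions are both FLAT global
// accesses, this returns GLOBAL_LOAD so the merged access can stay global;
// if either one is a plain (non-segment-specific) flat access, the class is
// left as FLAT_LOAD and the merge produces a flat operation.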
714
715static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
716 AddressRegs Result;
717
718 if (TII.isMUBUF(Opc)) {
719 if (AMDGPU::getMUBUFHasVAddr(Opc))
720 Result.VAddr = true;
721 if (AMDGPU::getMUBUFHasSrsrc(Opc))
722 Result.SRsrc = true;
723 if (AMDGPU::getMUBUFHasSoffset(Opc))
724 Result.SOffset = true;
725
726 return Result;
727 }
728
729 if (TII.isImage(Opc)) {
730 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
731 if (VAddr0Idx >= 0) {
732 AMDGPU::OpName RsrcName =
733 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
734 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
735 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
736 } else {
737 Result.VAddr = true;
738 }
739 Result.SRsrc = true;
740 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
741 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
742 Result.SSamp = true;
743
744 return Result;
745 }
746 if (TII.isMTBUF(Opc)) {
747 if (AMDGPU::getMTBUFHasVAddr(Opc))
748 Result.VAddr = true;
749 if (AMDGPU::getMTBUFHasSrsrc(Opc))
750 Result.SRsrc = true;
751 if (AMDGPU::getMTBUFHasSoffset(Opc))
752 Result.SOffset = true;
753
754 return Result;
755 }
756
757 switch (Opc) {
758 default:
759 return Result;
760 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
761 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
762 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
763 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
764 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
765 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
766 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
767 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
768 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
769 Result.SOffset = true;
770 [[fallthrough]];
771 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
772 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
773 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
774 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
775 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
776 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
777 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
778 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
779 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
780 case AMDGPU::S_LOAD_DWORD_IMM:
781 case AMDGPU::S_LOAD_DWORDX2_IMM:
782 case AMDGPU::S_LOAD_DWORDX3_IMM:
783 case AMDGPU::S_LOAD_DWORDX4_IMM:
784 case AMDGPU::S_LOAD_DWORDX8_IMM:
785 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
786 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
787 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
788 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
789 Result.SBase = true;
790 return Result;
791 case AMDGPU::DS_READ_B32:
792 case AMDGPU::DS_READ_B64:
793 case AMDGPU::DS_READ_B32_gfx9:
794 case AMDGPU::DS_READ_B64_gfx9:
795 case AMDGPU::DS_WRITE_B32:
796 case AMDGPU::DS_WRITE_B64:
797 case AMDGPU::DS_WRITE_B32_gfx9:
798 case AMDGPU::DS_WRITE_B64_gfx9:
799 Result.Addr = true;
800 return Result;
801 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
802 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
803 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
804 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
805 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
806 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
807 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
808 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
809 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
810 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
811 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
812 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
813 case AMDGPU::FLAT_STORE_DWORD_SADDR:
814 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
815 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
816 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
817 Result.SAddr = true;
818 [[fallthrough]];
819 case AMDGPU::GLOBAL_LOAD_DWORD:
820 case AMDGPU::GLOBAL_LOAD_DWORDX2:
821 case AMDGPU::GLOBAL_LOAD_DWORDX3:
822 case AMDGPU::GLOBAL_LOAD_DWORDX4:
823 case AMDGPU::GLOBAL_STORE_DWORD:
824 case AMDGPU::GLOBAL_STORE_DWORDX2:
825 case AMDGPU::GLOBAL_STORE_DWORDX3:
826 case AMDGPU::GLOBAL_STORE_DWORDX4:
827 case AMDGPU::FLAT_LOAD_DWORD:
828 case AMDGPU::FLAT_LOAD_DWORDX2:
829 case AMDGPU::FLAT_LOAD_DWORDX3:
830 case AMDGPU::FLAT_LOAD_DWORDX4:
831 case AMDGPU::FLAT_STORE_DWORD:
832 case AMDGPU::FLAT_STORE_DWORDX2:
833 case AMDGPU::FLAT_STORE_DWORDX3:
834 case AMDGPU::FLAT_STORE_DWORDX4:
835 Result.VAddr = true;
836 return Result;
837 }
838}
839
840void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
841 const SILoadStoreOptimizer &LSO) {
842 I = MI;
843 unsigned Opc = MI->getOpcode();
844 InstClass = getInstClass(Opc, *LSO.TII);
845
846 if (InstClass == UNKNOWN)
847 return;
848
849 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
850
851 switch (InstClass) {
852 case DS_READ:
853 EltSize =
854 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
855 : 4;
856 break;
857 case DS_WRITE:
858 EltSize =
859 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
860 : 4;
861 break;
862 case S_BUFFER_LOAD_IMM:
863 case S_BUFFER_LOAD_SGPR_IMM:
864 case S_LOAD_IMM:
865 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
866 break;
867 default:
868 EltSize = 4;
869 break;
870 }
871
872 if (InstClass == MIMG) {
873 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
874 // Offset is not considered for MIMG instructions.
875 Offset = 0;
876 } else {
877 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
878 Offset = I->getOperand(OffsetIdx).getImm();
879 }
880
881 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
882 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
883 const AMDGPU::GcnBufferFormatInfo *Info =
884 AMDGPU::getGcnBufferFormatInfo(Format, *LSO.STM);
885 EltSize = Info->BitsPerComp / 8;
886 }
887
888 Width = getOpcodeWidth(*I, *LSO.TII);
889
890 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
891 Offset &= 0xffff;
892 } else if (InstClass != MIMG) {
893 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
894 }
895
896 AddressRegs Regs = getRegs(Opc, *LSO.TII);
897 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
898
899 NumAddresses = 0;
900 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
901 AddrIdx[NumAddresses++] =
902 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
903 if (Regs.Addr)
904 AddrIdx[NumAddresses++] =
905 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
906 if (Regs.SBase)
907 AddrIdx[NumAddresses++] =
908 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
909 if (Regs.SRsrc)
910 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
911 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
912 if (Regs.SOffset)
913 AddrIdx[NumAddresses++] =
914 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
915 if (Regs.SAddr)
916 AddrIdx[NumAddresses++] =
917 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
918 if (Regs.VAddr)
919 AddrIdx[NumAddresses++] =
920 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
921 if (Regs.SSamp)
922 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
923 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
924 assert(NumAddresses <= MaxAddressRegs);
925
926 for (unsigned J = 0; J < NumAddresses; J++)
927 AddrReg[J] = &I->getOperand(AddrIdx[J]);
928}
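// Rough example of what setMI collects (illustrative): for a
// BUFFER_LOAD_DWORD_OFFEN, getRegs() reports VAddr, SRsrc and SOffset, so
// NumAddresses becomes 3 and AddrIdx/AddrReg record the vaddr, srsrc and
// soffset operands; hasSameBaseAddress() then compares exactly those
// operands between two candidate instructions.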
929
930} // end anonymous namespace.
931
932INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
933 "SI Load Store Optimizer", false, false)
934INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
935INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
936 "SI Load Store Optimizer", false, false)
937
938char SILoadStoreOptimizerLegacy::ID = 0;
939
940char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;
941
942FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
943 return new SILoadStoreOptimizerLegacy();
944}
945
946static void addDefsUsesToList(const MachineInstr &MI,
947 DenseSet<Register> &RegDefs,
948 DenseSet<Register> &RegUses) {
949 for (const auto &Op : MI.operands()) {
950 if (!Op.isReg())
951 continue;
952 if (Op.isDef())
953 RegDefs.insert(Op.getReg());
954 if (Op.readsReg())
955 RegUses.insert(Op.getReg());
956 }
957}
958
959bool SILoadStoreOptimizer::canSwapInstructions(
960 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
961 const MachineInstr &A, const MachineInstr &B) const {
962 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
963 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
964 return false;
965 for (const auto &BOp : B.operands()) {
966 if (!BOp.isReg())
967 continue;
968 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
969 return false;
970 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
971 return false;
972 }
973 return true;
974}
975
976// Given that \p CI and \p Paired are adjacent memory operations produce a new
977// MMO for the combined operation with a new access size.
978MachineMemOperand *
979SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
980 const CombineInfo &Paired) {
981 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
982 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
983
984 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
985
986 // A base pointer for the combined operation is the same as the leading
987 // operation's pointer.
988 if (Paired < CI)
989 std::swap(MMOa, MMOb);
990
991 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
992 // If merging FLAT and GLOBAL set address space to FLAT.
993 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
994 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
995
996 MachineFunction *MF = CI.I->getMF();
997 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
998}
999
1000bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
1001 const SIInstrInfo &TII,
1002 const CombineInfo &Paired) {
1003 assert(CI.InstClass == MIMG);
1004
1005 // Ignore instructions with tfe/lwe set.
1006 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
1007 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
1008
1009 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
1010 return false;
1011
1012 // Check other optional immediate operands for equality.
1013 AMDGPU::OpName OperandsToMatch[] = {
1014 AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
1015 AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};
1016
1017 for (AMDGPU::OpName op : OperandsToMatch) {
1018 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
1019 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
1020 return false;
1021 if (Idx != -1 &&
1022 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
1023 return false;
1024 }
1025
1026 // Check DMask for overlaps.
1027 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
1028 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
1029
1030 if (!MaxMask)
1031 return false;
1032
1033 unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
1034 if ((1u << AllowedBitsForMin) <= MinMask)
1035 return false;
1036
1037 return true;
1038}
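// Worked example of the DMask check above (hypothetical values): DMask 0b0100
// and 0b0011 can combine because every bit of the smaller mask (0b0011) lies
// below the lowest set bit of the larger mask (bit 2); DMask 0b0101 and
// 0b0010 are rejected because the masks interleave.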
1039
1040static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
1041 unsigned ComponentCount,
1042 const GCNSubtarget &STI) {
1043 if (ComponentCount > 4)
1044 return 0;
1045
1046 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
1047 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
1048 if (!OldFormatInfo)
1049 return 0;
1050
1051 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
1052 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
1053 ComponentCount,
1054 OldFormatInfo->NumFormat, STI);
1055
1056 if (!NewFormatInfo)
1057 return 0;
1058
1059 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
1060 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
1061
1062 return NewFormatInfo->Format;
1063}
1064
1065// Return the value in the inclusive range [Lo,Hi] that is aligned to the
1066// highest power of two. Note that the result is well defined for all inputs
1067// including corner cases like:
1068// - if Lo == Hi, return that value
1069// - if Lo == 0, return 0 (even though the "- 1" below underflows)
1070// - if Lo > Hi, return 0 (as if the range wrapped around)
1071static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
1072 return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
1073}
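// A few illustrative values (hypothetical inputs, not taken from real
// offsets): mostAlignedValueInRange(5, 9) == 8 (8 is the most aligned value
// in [5,9]), mostAlignedValueInRange(6, 6) == 6 (the Lo == Hi case), and
// mostAlignedValueInRange(33, 64) == 64 (aligned to 64).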
1074
1075bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1076 const GCNSubtarget &STI,
1077 CombineInfo &Paired,
1078 bool Modify) {
1079 assert(CI.InstClass != MIMG);
1080
1081 // XXX - Would the same offset be OK? Is there any reason this would happen or
1082 // be useful?
1083 if (CI.Offset == Paired.Offset)
1084 return false;
1085
1086 // This won't be valid if the offset isn't aligned.
1087 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1088 return false;
1089
1090 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1091
1092 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1093 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1094 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1095 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1096
1097 if (Info0->BitsPerComp != Info1->BitsPerComp ||
1098 Info0->NumFormat != Info1->NumFormat)
1099 return false;
1100
1101 // For 8-bit or 16-bit formats there is no 3-component variant.
1102 // If NumCombinedComponents is 3, try the 4-component format and use XYZ.
1103 // Example:
1104 // tbuffer_load_format_x + tbuffer_load_format_x + tbuffer_load_format_x
1105 // ==> tbuffer_load_format_xyz with format:[BUF_FMT_16_16_16_16_SNORM]
1106 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1107 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1108 NumCombinedComponents = 4;
1109
1110 if (getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, STI) ==
1111 0)
1112 return false;
1113
1114 // Merge only when the two access ranges are strictly back-to-back,
1115// any gap or overlap can overwrite data or leave holes.
1116 unsigned ElemIndex0 = CI.Offset / CI.EltSize;
1117 unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
1118 if (ElemIndex0 + CI.Width != ElemIndex1 &&
1119 ElemIndex1 + Paired.Width != ElemIndex0)
1120 return false;
1121
1122 // 1-byte formats require 1-byte alignment.
1123 // 2-byte formats require 2-byte alignment.
1124 // 4-byte and larger formats require 4-byte alignment.
1125 unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
1126 unsigned RequiredAlign = std::min(MergedBytes, 4u);
1127 unsigned MinOff = std::min(CI.Offset, Paired.Offset);
1128 if (MinOff % RequiredAlign != 0)
1129 return false;
1130
1131 return true;
1132 }
1133
1134 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1135 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1136 CI.UseST64 = false;
1137 CI.BaseOff = 0;
1138
1139 // Handle all non-DS instructions.
1140 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1141 if (EltOffset0 + CI.Width != EltOffset1 &&
1142 EltOffset1 + Paired.Width != EltOffset0)
1143 return false;
1144 // Instructions with scale_offset modifier cannot be combined unless we
1145 // also generate code to scale the offset and reset that bit.
1146 if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL))
1147 return false;
1148 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1149 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1150 // Reject cases like:
1151 // dword + dwordx2 -> dwordx3
1152 // dword + dwordx3 -> dwordx4
1153 // If we tried to combine these cases, we would fail to extract a subreg
1154 // for the result of the second load due to SGPR alignment requirements.
1155 if (CI.Width != Paired.Width &&
1156 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1157 return false;
1158 }
1159 return true;
1160 }
1161
1162 // If the offset in elements doesn't fit in 8-bits, we might be able to use
1163 // the stride 64 versions.
1164 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1165 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1166 if (Modify) {
1167 CI.Offset = EltOffset0 / 64;
1168 Paired.Offset = EltOffset1 / 64;
1169 CI.UseST64 = true;
1170 }
1171 return true;
1172 }
1173
1174 // Check if the new offsets fit in the reduced 8-bit range.
1175 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1176 if (Modify) {
1177 CI.Offset = EltOffset0;
1178 Paired.Offset = EltOffset1;
1179 }
1180 return true;
1181 }
1182
1183 // Try to shift base address to decrease offsets.
1184 uint32_t Min = std::min(EltOffset0, EltOffset1);
1185 uint32_t Max = std::max(EltOffset0, EltOffset1);
1186
1187 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1188 if (((Max - Min) & ~Mask) == 0) {
1189 if (Modify) {
1190 // From the range of values we could use for BaseOff, choose the one that
1191 // is aligned to the highest power of two, to maximise the chance that
1192 // the same offset can be reused for other load/store pairs.
1193 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1194 // Copy the low bits of the offsets, so that when we adjust them by
1195 // subtracting BaseOff they will be multiples of 64.
1196 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1197 CI.BaseOff = BaseOff * CI.EltSize;
1198 CI.Offset = (EltOffset0 - BaseOff) / 64;
1199 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1200 CI.UseST64 = true;
1201 }
1202 return true;
1203 }
1204
1205 if (isUInt<8>(Max - Min)) {
1206 if (Modify) {
1207 // From the range of values we could use for BaseOff, choose the one that
1208 // is aligned to the highest power of two, to maximise the chance that
1209 // the same offset can be reused for other load/store pairs.
1210 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1211 CI.BaseOff = BaseOff * CI.EltSize;
1212 CI.Offset = EltOffset0 - BaseOff;
1213 Paired.Offset = EltOffset1 - BaseOff;
1214 }
1215 return true;
1216 }
1217
1218 return false;
1219}
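// Two hypothetical DS examples (EltSize == 4) of the paths above:
// - ds_read_b32 at byte offsets 0 and 8192: element offsets 0 and 2048 are
//   both multiples of 64, and 0 and 2048/64 = 32 fit in 8 bits, so the ST64
//   form is chosen with offsets 0 and 32.
// - ds_read_b32 at byte offsets 1200 and 1232: element offsets 300 and 308 do
//   not fit in 8 bits, but their difference does, so a base offset of 256
//   elements (CI.BaseOff = 1024 bytes) is folded into the address and the
//   offsets become 44 and 52.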
1220
1221bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1222 const CombineInfo &CI,
1223 const CombineInfo &Paired) {
1224 const unsigned Width = (CI.Width + Paired.Width);
1225 switch (CI.InstClass) {
1226 default:
1227 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1228 case S_BUFFER_LOAD_IMM:
1229 case S_BUFFER_LOAD_SGPR_IMM:
1230 case S_LOAD_IMM:
1231 switch (Width) {
1232 default:
1233 return false;
1234 case 2:
1235 case 4:
1236 case 8:
1237 return true;
1238 case 3:
1239 return STM.hasScalarDwordx3Loads();
1240 }
1241 }
1242}
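// For instance (sketch): merging two 2-dword scalar loads gives Width == 4,
// which is always allowed, while a 1-dword + 2-dword scalar merge gives
// Width == 3 and is only allowed when the subtarget has scalar dwordx3 loads;
// for the buffer/flat classes the combined width must stay <= 4 and width 3
// additionally requires dwordx3 load/store support.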
1243
1244const TargetRegisterClass *
1245SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1246 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1247 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1248 }
1249 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1250 return TRI->getRegClassForReg(*MRI, Src->getReg());
1251 }
1252 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1253 return TRI->getRegClassForReg(*MRI, Src->getReg());
1254 }
1255 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1256 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1257 }
1258 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1259 return TRI->getRegClassForReg(*MRI, Src->getReg());
1260 }
1261 return nullptr;
1262}
1263
1264/// This function assumes that CI comes before Paired in a basic block. Return
1265/// an insertion point for the merged instruction or nullptr on failure.
1266SILoadStoreOptimizer::CombineInfo *
1267SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1268 CombineInfo &Paired) {
1269 // If another instruction has already been merged into CI, it may now be a
1270 // type that we can't do any further merging into.
1271 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1272 return nullptr;
1273 assert(CI.InstClass == Paired.InstClass);
1274
1275 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1276 getInstSubclass(Paired.I->getOpcode(), *TII))
1277 return nullptr;
1278
1279 // Check both offsets (or masks for MIMG) can be combined and fit in the
1280 // reduced range.
1281 if (CI.InstClass == MIMG) {
1282 if (!dmasksCanBeCombined(CI, *TII, Paired))
1283 return nullptr;
1284 } else {
1285 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1286 return nullptr;
1287 }
1288
1289 DenseSet<Register> RegDefs;
1290 DenseSet<Register> RegUses;
1291 CombineInfo *Where;
1292 if (CI.I->mayLoad()) {
1293 // Try to hoist Paired up to CI.
1294 addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1295 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1296 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1297 return nullptr;
1298 }
1299 Where = &CI;
1300 } else {
1301 // Try to sink CI down to Paired.
1302 addDefsUsesToList(*CI.I, RegDefs, RegUses);
1303 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1304 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1305 return nullptr;
1306 }
1307 Where = &Paired;
1308 }
1309
1310 // Call offsetsCanBeCombined with modify = true so that the offsets are
1311 // correct for the new instruction. This should return true, because
1312 // this function should only be called on CombineInfo objects that
1313 // have already been confirmed to be mergeable.
1314 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1315 offsetsCanBeCombined(CI, *STM, Paired, true);
1316 return Where;
1317}
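// In other words (sketch of the scheduling decision above): for a load pair
// the merged instruction is inserted at CI and Paired is effectively hoisted
// up to it, so every instruction between them must be swappable with Paired;
// for a store pair CI is sunk down to Paired, so every instruction in between
// must be swappable with CI.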
1318
1319// Copy the merged load result from DestReg to the original dest regs of CI and
1320// Paired.
1321void SILoadStoreOptimizer::copyToDestRegs(
1322 CombineInfo &CI, CombineInfo &Paired,
1323 MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName,
1324 Register DestReg) const {
1325 MachineBasicBlock *MBB = CI.I->getParent();
1326 DebugLoc DL = CI.I->getDebugLoc();
1327
1328 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1329
1330 // Copy to the old destination registers.
1331 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1332 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1333 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1334
1335 // The constrained sload instructions in S_LOAD_IMM class will have
1336 // `early-clobber` flag in the dst operand. Remove the flag before using the
1337 // MOs in copies.
1338 Dest0->setIsEarlyClobber(false);
1339 Dest1->setIsEarlyClobber(false);
1340
1341 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1342 .add(*Dest0) // Copy to same destination including flags and sub reg.
1343 .addReg(DestReg, 0, SubRegIdx0);
1344 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1345 .add(*Dest1)
1346 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1347}
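// For example, after two ds_read_b32 results are merged into one ds_read2_b32
// writing a 64-bit register, the two COPYs built above feed the original
// destinations from sub-registers of DestReg (e.g. sub0 and sub1; the exact
// indices come from getSubRegIdxs).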
1348
1349// Return a register for the source of the merged store after copying the
1350// original source regs of CI and Paired into it.
1351Register
1352SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1353 MachineBasicBlock::iterator InsertBefore,
1354 AMDGPU::OpName OpName) const {
1355 MachineBasicBlock *MBB = CI.I->getParent();
1356 DebugLoc DL = CI.I->getDebugLoc();
1357
1358 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1359
1360 // Copy to the new source register.
1361 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1362 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1363
1364 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1365 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1366
1367 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1368 .add(*Src0)
1369 .addImm(SubRegIdx0)
1370 .add(*Src1)
1371 .addImm(SubRegIdx1);
1372
1373 return SrcReg;
1374}
1375
1376unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1377 if (STM->ldsRequiresM0Init())
1378 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1379 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1380}
1381
1382unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1383 if (STM->ldsRequiresM0Init())
1384 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1385
1386 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1387 : AMDGPU::DS_READ2ST64_B64_gfx9;
1388}
1389
1390MachineBasicBlock::iterator
1391SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1392 MachineBasicBlock::iterator InsertBefore) {
1393 MachineBasicBlock *MBB = CI.I->getParent();
1394
1395 // Be careful, since the addresses could be subregisters themselves in weird
1396 // cases, like vectors of pointers.
1397 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1398
1399 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1400 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1401 unsigned Opc =
1402 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1403
1404 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1405 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1406
1407 const MCInstrDesc &Read2Desc = TII->get(Opc);
1408
1409 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1410 Register DestReg = MRI->createVirtualRegister(SuperRC);
1411
1412 DebugLoc DL = CI.I->getDebugLoc();
1413
1414 Register BaseReg = AddrReg->getReg();
1415 unsigned BaseSubReg = AddrReg->getSubReg();
1416 unsigned BaseRegFlags = 0;
1417 if (CI.BaseOff) {
1418 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1419 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1420 .addImm(CI.BaseOff);
1421
1422 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1423 BaseRegFlags = RegState::Kill;
1424
1425 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1426 .addReg(ImmReg)
1427 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1428 .addImm(0); // clamp bit
1429 BaseSubReg = 0;
1430 }
1431
1432 MachineInstrBuilder Read2 =
1433 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1434 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1435 .addImm(NewOffset0) // offset0
1436 .addImm(NewOffset1) // offset1
1437 .addImm(0) // gds
1438 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1439
1440 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1441
1442 CI.I->eraseFromParent();
1443 Paired.I->eraseFromParent();
1444
1445 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1446 return Read2;
1447}
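// Sketch of the BaseOff path above, assuming offsetsCanBeCombined previously
// chose a base shift: the code first materializes the byte offset with
// s_mov_b32 into an SGPR, adds it to the original address with a no-carry
// VALU add to form a fresh VGPR base, and only then emits the ds_read2 /
// ds_read2st64 with the reduced offset0/offset1 fields.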
1448
1449unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1450 if (STM->ldsRequiresM0Init())
1451 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1452 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1453 : AMDGPU::DS_WRITE2_B64_gfx9;
1454}
1455
1456unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1457 if (STM->ldsRequiresM0Init())
1458 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1459 : AMDGPU::DS_WRITE2ST64_B64;
1460
1461 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1462 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1463}
1464
1465MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1466 CombineInfo &CI, CombineInfo &Paired,
1467 MachineBasicBlock::iterator InsertBefore) {
1468 MachineBasicBlock *MBB = CI.I->getParent();
1469
1470 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1471 // sure we preserve the subregister index and any register flags set on them.
1472 const MachineOperand *AddrReg =
1473 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1474 const MachineOperand *Data0 =
1475 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1476 const MachineOperand *Data1 =
1477 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1478
1479 unsigned NewOffset0 = CI.Offset;
1480 unsigned NewOffset1 = Paired.Offset;
1481 unsigned Opc =
1482 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1483
1484 if (NewOffset0 > NewOffset1) {
1485 // Canonicalize the merged instruction so the smaller offset comes first.
1486 std::swap(NewOffset0, NewOffset1);
1487 std::swap(Data0, Data1);
1488 }
1489
1490 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1491 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1492
1493 const MCInstrDesc &Write2Desc = TII->get(Opc);
1494 DebugLoc DL = CI.I->getDebugLoc();
1495
1496 Register BaseReg = AddrReg->getReg();
1497 unsigned BaseSubReg = AddrReg->getSubReg();
1498 unsigned BaseRegFlags = 0;
1499 if (CI.BaseOff) {
1500 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1501 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1502 .addImm(CI.BaseOff);
1503
1504 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1505 BaseRegFlags = RegState::Kill;
1506
1507 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1508 .addReg(ImmReg)
1509 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1510 .addImm(0); // clamp bit
1511 BaseSubReg = 0;
1512 }
1513
1514 MachineInstrBuilder Write2 =
1515 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1516 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1517 .add(*Data0) // data0
1518 .add(*Data1) // data1
1519 .addImm(NewOffset0) // offset0
1520 .addImm(NewOffset1) // offset1
1521 .addImm(0) // gds
1522 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1523
1524 CI.I->eraseFromParent();
1525 Paired.I->eraseFromParent();
1526
1527 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1528 return Write2;
1529}
1530
1531MachineBasicBlock::iterator
1532SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1533 MachineBasicBlock::iterator InsertBefore) {
1534 MachineBasicBlock *MBB = CI.I->getParent();
1535 DebugLoc DL = CI.I->getDebugLoc();
1536 const unsigned Opcode = getNewOpcode(CI, Paired);
1537
1538 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1539
1540 Register DestReg = MRI->createVirtualRegister(SuperRC);
1541 unsigned MergedDMask = CI.DMask | Paired.DMask;
1542 unsigned DMaskIdx =
1543 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1544
1545 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1546 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1547 if (I == DMaskIdx)
1548 MIB.addImm(MergedDMask);
1549 else
1550 MIB.add((*CI.I).getOperand(I));
1551 }
1552
1553 // It shouldn't be possible to get this far if the two instructions
1554 // don't have a single memoperand, because MachineInstr::mayAlias()
1555 // will return true if this is the case.
1556 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1557
1558 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1559
1560 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1561
1562 CI.I->eraseFromParent();
1563 Paired.I->eraseFromParent();
1564 return New;
1565}
1566
1567MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1568 CombineInfo &CI, CombineInfo &Paired,
1569 MachineBasicBlock::iterator InsertBefore) {
1570 MachineBasicBlock *MBB = CI.I->getParent();
1571 DebugLoc DL = CI.I->getDebugLoc();
1572 const unsigned Opcode = getNewOpcode(CI, Paired);
1573
1574 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1575
1576 Register DestReg = MRI->createVirtualRegister(SuperRC);
1577 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1578
1579 // It shouldn't be possible to get this far if the two instructions
1580 // don't have a single memoperand, because MachineInstr::mayAlias()
1581 // will return true if this is the case.
1582 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1583
1584 MachineInstrBuilder New =
1585 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1586 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1587 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1588 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1589 New.addImm(MergedOffset);
1590 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1591
1592 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1593
1594 CI.I->eraseFromParent();
1595 Paired.I->eraseFromParent();
1596 return New;
1597}
1598
1599MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1600 CombineInfo &CI, CombineInfo &Paired,
1601 MachineBasicBlock::iterator InsertBefore) {
1602 MachineBasicBlock *MBB = CI.I->getParent();
1603 DebugLoc DL = CI.I->getDebugLoc();
1604
1605 const unsigned Opcode = getNewOpcode(CI, Paired);
1606
1607 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1608
1609 // Create the destination register for the merged load.
1610 Register DestReg = MRI->createVirtualRegister(SuperRC);
1611 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1612
1613 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1614
1615 AddressRegs Regs = getRegs(Opcode, *TII);
1616
1617 if (Regs.VAddr)
1618 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1619
1620 // It shouldn't be possible to get this far if the two instructions
1621 // don't have a single memoperand, because MachineInstr::mayAlias()
1622 // will return true if this is the case.
1623 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1624
1625 MachineInstr *New =
1626 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1627 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1628 .addImm(MergedOffset) // offset
1629 .addImm(CI.CPol) // cpol
1630 .addImm(0) // swz
1631 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1632
1633 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1634
1635 CI.I->eraseFromParent();
1636 Paired.I->eraseFromParent();
1637 return New;
1638}
1639
1640MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1641 CombineInfo &CI, CombineInfo &Paired,
1642 MachineBasicBlock::iterator InsertBefore) {
1643 MachineBasicBlock *MBB = CI.I->getParent();
1644 DebugLoc DL = CI.I->getDebugLoc();
1645
1646 const unsigned Opcode = getNewOpcode(CI, Paired);
1647
1648 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1649
1650 // Create the destination register for the merged load.
1651 Register DestReg = MRI->createVirtualRegister(SuperRC);
1652 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1653
1654 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1655
1656 AddressRegs Regs = getRegs(Opcode, *TII);
1657
1658 if (Regs.VAddr)
1659 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1660
1661 // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1662 // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1663 // and use XYZ of XYZW to enable the merge.
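// For instance, following the XY+X case above with a 16-bit component
// format: the combined count of 3 has no 3-component encoding, so the merge
// is emitted with the corresponding 4-component format and only the XYZ
// components of the result are actually used.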
1664 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1665 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1666 NumCombinedComponents = 4;
1667 unsigned JoinedFormat =
1668 getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
1669
1670 // It shouldn't be possible to get this far if the two instructions
1671 // don't have a single memoperand, because MachineInstr::mayAlias()
1672 // will return true if this is the case.
1673 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1674
1675 MachineInstr *New =
1676 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1677 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1678 .addImm(MergedOffset) // offset
1679 .addImm(JoinedFormat) // format
1680 .addImm(CI.CPol) // cpol
1681 .addImm(0) // swz
1682 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1683
1684 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1685
1686 CI.I->eraseFromParent();
1687 Paired.I->eraseFromParent();
1688 return New;
1689}
1690
1691MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1692 CombineInfo &CI, CombineInfo &Paired,
1693 MachineBasicBlock::iterator InsertBefore) {
1694 MachineBasicBlock *MBB = CI.I->getParent();
1695 DebugLoc DL = CI.I->getDebugLoc();
1696
1697 const unsigned Opcode = getNewOpcode(CI, Paired);
1698
1699 Register SrcReg =
1700 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1701
1702 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1703 .addReg(SrcReg, RegState::Kill);
1704
1705 AddressRegs Regs = getRegs(Opcode, *TII);
1706
1707 if (Regs.VAddr)
1708 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1709
1710 // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1711 // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1712 // and use XYZ of XYZW to enable the merge.
1713 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1714 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1715 NumCombinedComponents = 4;
1716 unsigned JoinedFormat =
1717 getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
1718
1719 // It shouldn't be possible to get this far if the two instructions
1720 // don't have a single memoperand, because MachineInstr::mayAlias()
1721 // will return true if this is the case.
1722 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1723
1724 MachineInstr *New =
1725 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1726 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1727 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1728 .addImm(JoinedFormat) // format
1729 .addImm(CI.CPol) // cpol
1730 .addImm(0) // swz
1731 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1732
1733 CI.I->eraseFromParent();
1734 Paired.I->eraseFromParent();
1735 return New;
1736}
1737
1738MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1739 CombineInfo &CI, CombineInfo &Paired,
1740 MachineBasicBlock::iterator InsertBefore) {
1741 MachineBasicBlock *MBB = CI.I->getParent();
1742 DebugLoc DL = CI.I->getDebugLoc();
1743
1744 const unsigned Opcode = getNewOpcode(CI, Paired);
1745
1746 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1747 Register DestReg = MRI->createVirtualRegister(SuperRC);
1748
1749 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1750
1751 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1752 MIB.add(*SAddr);
1753
1754 MachineInstr *New =
1755 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1756 .addImm(std::min(CI.Offset, Paired.Offset))
1757 .addImm(CI.CPol)
1758 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1759
1760 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1761
1762 CI.I->eraseFromParent();
1763 Paired.I->eraseFromParent();
1764 return New;
1765}
1766
1767MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1768 CombineInfo &CI, CombineInfo &Paired,
1769 MachineBasicBlock::iterator InsertBefore) {
1770 MachineBasicBlock *MBB = CI.I->getParent();
1771 DebugLoc DL = CI.I->getDebugLoc();
1772
1773 const unsigned Opcode = getNewOpcode(CI, Paired);
1774
1775 Register SrcReg =
1776 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1777
1778 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1779 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1780 .addReg(SrcReg, RegState::Kill);
1781
1782 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1783 MIB.add(*SAddr);
1784
1785 MachineInstr *New =
1786 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1787 .addImm(CI.CPol)
1788 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1789
1790 CI.I->eraseFromParent();
1791 Paired.I->eraseFromParent();
1792 return New;
1793}
1794
1795static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1796 ArrayRef<MachineMemOperand *> MMOs,
1797 unsigned Width) {
1798 // Conservatively returns true if the MMO was not found.
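// For example, with Width == 2 (a dwordx2 result) the first load's MMO must
// guarantee at least 8-byte alignment; if XNACK is enabled and the MMO is
// missing, merged, or aligned to less than that, the caller falls back to
// the constrained (_ec) opcode variants.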
1799 return STM.isXNACKEnabled() &&
1800 (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1801}
1802
1803unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1804 const CombineInfo &Paired) {
1805 const unsigned Width = CI.Width + Paired.Width;
1806
1807 switch (getCommonInstClass(CI, Paired)) {
1808 default:
1809 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1810 // FIXME: Handle d16 correctly
1811 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1812 Width);
1813 case TBUFFER_LOAD:
1814 case TBUFFER_STORE:
1815 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1816 Width);
1817
1818 case UNKNOWN:
1819 llvm_unreachable("Unknown instruction class");
1820 case S_BUFFER_LOAD_IMM: {
1821 // If XNACK is enabled, use the constrained opcodes when the first load is
1822 // under-aligned.
1823 bool NeedsConstrainedOpc =
1824 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1825 switch (Width) {
1826 default:
1827 return 0;
1828 case 2:
1829 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1830 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1831 case 3:
1832 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1833 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1834 case 4:
1835 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1836 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1837 case 8:
1838 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1839 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1840 }
1841 }
1842 case S_BUFFER_LOAD_SGPR_IMM: {
1843 // If XNACK is enabled, use the constrained opcodes when the first load is
1844 // under-aligned.
1845 bool NeedsConstrainedOpc =
1846 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1847 switch (Width) {
1848 default:
1849 return 0;
1850 case 2:
1851 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1852 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1853 case 3:
1854 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1855 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1856 case 4:
1857 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1858 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1859 case 8:
1860 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1861 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1862 }
1863 }
1864 case S_LOAD_IMM: {
1865 // If XNACK is enabled, use the constrained opcodes when the first load is
1866 // under-aligned.
1867 bool NeedsConstrainedOpc =
1868 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1869 switch (Width) {
1870 default:
1871 return 0;
1872 case 2:
1873 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1874 : AMDGPU::S_LOAD_DWORDX2_IMM;
1875 case 3:
1876 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1877 : AMDGPU::S_LOAD_DWORDX3_IMM;
1878 case 4:
1879 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1880 : AMDGPU::S_LOAD_DWORDX4_IMM;
1881 case 8:
1882 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1883 : AMDGPU::S_LOAD_DWORDX8_IMM;
1884 }
1885 }
1886 case GLOBAL_LOAD:
1887 switch (Width) {
1888 default:
1889 return 0;
1890 case 2:
1891 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1892 case 3:
1893 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1894 case 4:
1895 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1896 }
1897 case GLOBAL_LOAD_SADDR:
1898 switch (Width) {
1899 default:
1900 return 0;
1901 case 2:
1902 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1903 case 3:
1904 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1905 case 4:
1906 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1907 }
1908 case GLOBAL_STORE:
1909 switch (Width) {
1910 default:
1911 return 0;
1912 case 2:
1913 return AMDGPU::GLOBAL_STORE_DWORDX2;
1914 case 3:
1915 return AMDGPU::GLOBAL_STORE_DWORDX3;
1916 case 4:
1917 return AMDGPU::GLOBAL_STORE_DWORDX4;
1918 }
1919 case GLOBAL_STORE_SADDR:
1920 switch (Width) {
1921 default:
1922 return 0;
1923 case 2:
1924 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1925 case 3:
1926 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1927 case 4:
1928 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1929 }
1930 case FLAT_LOAD:
1931 switch (Width) {
1932 default:
1933 return 0;
1934 case 2:
1935 return AMDGPU::FLAT_LOAD_DWORDX2;
1936 case 3:
1937 return AMDGPU::FLAT_LOAD_DWORDX3;
1938 case 4:
1939 return AMDGPU::FLAT_LOAD_DWORDX4;
1940 }
1941 case FLAT_STORE:
1942 switch (Width) {
1943 default:
1944 return 0;
1945 case 2:
1946 return AMDGPU::FLAT_STORE_DWORDX2;
1947 case 3:
1948 return AMDGPU::FLAT_STORE_DWORDX3;
1949 case 4:
1950 return AMDGPU::FLAT_STORE_DWORDX4;
1951 }
1952 case FLAT_LOAD_SADDR:
1953 switch (Width) {
1954 default:
1955 return 0;
1956 case 2:
1957 return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
1958 case 3:
1959 return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
1960 case 4:
1961 return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
1962 }
1963 case FLAT_STORE_SADDR:
1964 switch (Width) {
1965 default:
1966 return 0;
1967 case 2:
1968 return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
1969 case 3:
1970 return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
1971 case 4:
1972 return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
1973 }
1974 case MIMG:
1975 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1976 "No overlaps");
1977 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1978 }
1979}
1980
1981std::pair<unsigned, unsigned>
1982SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1983 const CombineInfo &Paired) {
1984 assert((CI.InstClass != MIMG ||
1985 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1986 CI.Width + Paired.Width)) &&
1987 "No overlaps");
1988
1989 unsigned Idx0;
1990 unsigned Idx1;
1991
1992 static const unsigned Idxs[5][4] = {
1993 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1994 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1995 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1996 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1997 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1998 };
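// For example, when Paired compares lower than CI (and therefore occupies
// the low lanes of the merged register) with Paired.Width == 1 and
// CI.Width == 2, the table yields Idx1 == sub0 for Paired and
// Idx0 == sub1_sub2 for CI.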
1999
2000 assert(CI.Width >= 1 && CI.Width <= 4);
2001 assert(Paired.Width >= 1 && Paired.Width <= 4);
2002
2003 if (Paired < CI) {
2004 Idx1 = Idxs[0][Paired.Width - 1];
2005 Idx0 = Idxs[Paired.Width][CI.Width - 1];
2006 } else {
2007 Idx0 = Idxs[0][CI.Width - 1];
2008 Idx1 = Idxs[CI.Width][Paired.Width - 1];
2009 }
2010
2011 return {Idx0, Idx1};
2012}
2013
2014const TargetRegisterClass *
2015SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
2016 const CombineInfo &Paired) const {
2017 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
2018 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
2019 switch (CI.Width + Paired.Width) {
2020 default:
2021 return nullptr;
2022 case 2:
2023 return &AMDGPU::SReg_64_XEXECRegClass;
2024 case 3:
2025 return &AMDGPU::SGPR_96RegClass;
2026 case 4:
2027 return &AMDGPU::SGPR_128RegClass;
2028 case 8:
2029 return &AMDGPU::SGPR_256RegClass;
2030 case 16:
2031 return &AMDGPU::SGPR_512RegClass;
2032 }
2033 }
2034
2035 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
2036 return TRI->isAGPRClass(getDataRegClass(*CI.I))
2037 ? TRI->getAGPRClassForBitWidth(BitWidth)
2038 : TRI->getVGPRClassForBitWidth(BitWidth);
2039}
2040
2041MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
2042 CombineInfo &CI, CombineInfo &Paired,
2043 MachineBasicBlock::iterator InsertBefore) {
2044 MachineBasicBlock *MBB = CI.I->getParent();
2045 DebugLoc DL = CI.I->getDebugLoc();
2046
2047 const unsigned Opcode = getNewOpcode(CI, Paired);
2048
2049 Register SrcReg =
2050 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
2051
2052 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
2053 .addReg(SrcReg, RegState::Kill);
2054
2055 AddressRegs Regs = getRegs(Opcode, *TII);
2056
2057 if (Regs.VAddr)
2058 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
2059
2060
2061 // It shouldn't be possible to get this far if the two instructions
2062 // don't have a single memoperand, because MachineInstr::mayAlias()
2063 // will return true if this is the case.
2064 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
2065
2066 MachineInstr *New =
2067 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
2068 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
2069 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
2070 .addImm(CI.CPol) // cpol
2071 .addImm(0) // swz
2072 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
2073
2074 CI.I->eraseFromParent();
2075 Paired.I->eraseFromParent();
2076 return New;
2077}
2078
2079MachineOperand
2080SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
2081 APInt V(32, Val, true);
2082 if (TII->isInlineConstant(V))
2083 return MachineOperand::CreateImm(Val);
2084
2085 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2086 MachineInstr *Mov =
2087 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
2088 TII->get(AMDGPU::S_MOV_B32), Reg)
2089 .addImm(Val);
2090 (void)Mov;
2091 LLVM_DEBUG(dbgs() << " "; Mov->dump());
2092 return MachineOperand::CreateReg(Reg, false);
2093}
2094
2095// Compute base address using Addr and return the final register.
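// The emitted sequence looks roughly like this (register names are
// illustrative):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 Base.LoReg, OffsetLo, 0
//   %hi:vgpr_32, %dead = V_ADDC_U32_e64 Base.HiReg, OffsetHi, %carry, 0
//   %full:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1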
2096Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
2097 const MemAddress &Addr) const {
2098 MachineBasicBlock *MBB = MI.getParent();
2099 MachineBasicBlock::iterator MBBI = MI.getIterator();
2100 DebugLoc DL = MI.getDebugLoc();
2101
2102 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2103 Addr.Base.LoSubReg) &&
2104 "Expected 32-bit Base-Register-Low!!");
2105
2106 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2107 Addr.Base.HiSubReg) &&
2108 "Expected 32-bit Base-Register-Hi!!");
2109
2110 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
2111 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
2112 MachineOperand OffsetHi =
2113 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
2114
2115 const auto *CarryRC = TRI->getWaveMaskRegClass();
2116 Register CarryReg = MRI->createVirtualRegister(CarryRC);
2117 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2118
2119 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2120 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2121 MachineInstr *LoHalf =
2122 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2123 .addReg(CarryReg, RegState::Define)
2124 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
2125 .add(OffsetLo)
2126 .addImm(0); // clamp bit
2127 (void)LoHalf;
2128 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
2129
2130 MachineInstr *HiHalf =
2131 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2132 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2133 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2134 .add(OffsetHi)
2135 .addReg(CarryReg, RegState::Kill)
2136 .addImm(0); // clamp bit
2137 (void)HiHalf;
2138 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
2139
2140 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2141 MachineInstr *FullBase =
2142 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2143 .addReg(DestSub0)
2144 .addImm(AMDGPU::sub0)
2145 .addReg(DestSub1)
2146 .addImm(AMDGPU::sub1);
2147 (void)FullBase;
2148 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
2149
2150 return FullDestReg;
2151}
2152
2153// Update base and offset with the NewBase and NewOffset in MI.
2154void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2155 Register NewBase,
2156 int32_t NewOffset) const {
2157 auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2158 Base->setReg(NewBase);
2159 Base->setIsKill(false);
2160 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2161}
2162
2163std::optional<int32_t>
2164SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2165 if (Op.isImm())
2166 return Op.getImm();
2167
2168 if (!Op.isReg())
2169 return std::nullopt;
2170
2171 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2172 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2173 !Def->getOperand(1).isImm())
2174 return std::nullopt;
2175
2176 return Def->getOperand(1).getImm();
2177}
2178
2179// Analyzes Base and extracts:
2180// - 32-bit base registers and subregisters
2181// - a 64-bit constant offset
2182// Expecting the base computation as:
2183// %OFFSET0:sgpr_32 = S_MOV_B32 8000
2184// %LO:vgpr_32, %c:sreg_64_xexec =
2185// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
2186// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2187// %Base:vreg_64 =
2188// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
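// In the example above this records %BASE_LO/%BASE_HI (plus their
// subregister indices) as Addr.Base and folds the two immediates into the
// 64-bit Addr.Offset, which evaluates to 8000 since the V_ADDC immediate
// is 0.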
2189void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2190 MemAddress &Addr) const {
2191 if (!Base.isReg())
2192 return;
2193
2194 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2195 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2196 || Def->getNumOperands() != 5)
2197 return;
2198
2199 MachineOperand BaseLo = Def->getOperand(1);
2200 MachineOperand BaseHi = Def->getOperand(3);
2201 if (!BaseLo.isReg() || !BaseHi.isReg())
2202 return;
2203
2204 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2205 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2206
2207 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2208 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2209 return;
2210
2211 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2212 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2213
2214 auto Offset0P = extractConstOffset(*Src0);
2215 if (Offset0P)
2216 BaseLo = *Src1;
2217 else {
2218 if (!(Offset0P = extractConstOffset(*Src1)))
2219 return;
2220 BaseLo = *Src0;
2221 }
2222
2223 if (!BaseLo.isReg())
2224 return;
2225
2226 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2227 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2228
2229 if (Src0->isImm())
2230 std::swap(Src0, Src1);
2231
2232 if (!Src1->isImm() || Src0->isImm())
2233 return;
2234
2235 uint64_t Offset1 = Src1->getImm();
2236 BaseHi = *Src0;
2237
2238 if (!BaseHi.isReg())
2239 return;
2240
2241 Addr.Base.LoReg = BaseLo.getReg();
2242 Addr.Base.HiReg = BaseHi.getReg();
2243 Addr.Base.LoSubReg = BaseLo.getSubReg();
2244 Addr.Base.HiSubReg = BaseHi.getSubReg();
2245 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2246}
2247
2248bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2249 MachineInstr &MI,
2250 MemInfoMap &Visited,
2251 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2252
2253 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2254 return false;
2255
2256 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2257 if (SIInstrInfo::isFLATScratch(MI))
2258 return false;
2259
2260 unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2261 : AMDGPUAS::FLAT_ADDRESS;
2262
2263 if (AnchorList.count(&MI))
2264 return false;
2265
2266 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2267
2268 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2269 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2270 return false;
2271 }
2272
2273 // Step 1: Find the base registers and a 64-bit constant offset.
2274 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2275 auto [It, Inserted] = Visited.try_emplace(&MI);
2276 MemAddress MAddr;
2277 if (Inserted) {
2278 processBaseWithConstOffset(Base, MAddr);
2279 It->second = MAddr;
2280 } else
2281 MAddr = It->second;
2282
2283 if (MAddr.Offset == 0) {
2284 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2285 " constant offsets that can be promoted.\n";);
2286 return false;
2287 }
2288
2289 LLVM_DEBUG(dbgs() << " BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
2290 << printReg(MAddr.Base.LoReg, TRI)
2291 << "} Offset: " << MAddr.Offset << "\n\n";);
2292
2293 // Step 2: Traverse MI's basic block and find an anchor (an instruction with
2294 // the same base registers) at the largest distance from MI's offset that still fits in 13 bits.
2295 // E.g. (64bit loads)
2296 // bb:
2297 // addr1 = &a + 4096; load1 = load(addr1, 0)
2298 // addr2 = &a + 6144; load2 = load(addr2, 0)
2299 // addr3 = &a + 8192; load3 = load(addr3, 0)
2300 // addr4 = &a + 10240; load4 = load(addr4, 0)
2301 // addr5 = &a + 12288; load5 = load(addr5, 0)
2302 //
2303 // Starting from the first load, the optimization will try to find a new base
2304 // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
2305 // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
2306 // &a + 8192 as the new base (anchor) because the larger distance can
2307 // presumably accommodate more intermediate addresses.
2308 //
2309 // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
2310 // (&a + 8192) for load1, load2 and load4; load3 becomes the anchor with offset 0.
2311 // addr = &a + 8192
2312 // load1 = load(addr, -4096)
2313 // load2 = load(addr, -2048)
2314 // load3 = load(addr, 0)
2315 // load4 = load(addr, 2048)
2316 // addr5 = &a + 12288; load5 = load(addr5, 0)
2317 //
2318 MachineInstr *AnchorInst = nullptr;
2319 MemAddress AnchorAddr;
2320 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2321 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2322
2323 MachineBasicBlock *MBB = MI.getParent();
2324 MachineBasicBlock::iterator E = MBB->end();
2325 MachineBasicBlock::iterator MBBI = MI.getIterator();
2326 ++MBBI;
2327 const SITargetLowering *TLI = STM->getTargetLowering();
2328
2329 for ( ; MBBI != E; ++MBBI) {
2330 MachineInstr &MINext = *MBBI;
2331 // TODO: Support finding an anchor (with the same base) from store addresses or
2332 // any other load addresses where the opcodes are different.
2333 if (MINext.getOpcode() != MI.getOpcode() ||
2334 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2335 continue;
2336
2337 const MachineOperand &BaseNext =
2338 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2339 MemAddress MAddrNext;
2340 auto [It, Inserted] = Visited.try_emplace(&MINext);
2341 if (Inserted) {
2342 processBaseWithConstOffset(BaseNext, MAddrNext);
2343 It->second = MAddrNext;
2344 } else
2345 MAddrNext = It->second;
2346
2347 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2348 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2349 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2350 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2351 continue;
2352
2353 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2354
2355 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2356 TargetLoweringBase::AddrMode AM;
2357 AM.HasBaseReg = true;
2358 AM.BaseOffs = Dist;
2359 if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2360 (uint32_t)std::abs(Dist) > MaxDist) {
2361 MaxDist = std::abs(Dist);
2362
2363 AnchorAddr = MAddrNext;
2364 AnchorInst = &MINext;
2365 }
2366 }
2367
2368 if (AnchorInst) {
2369 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2370 AnchorInst->dump());
2371 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2372 << AnchorAddr.Offset << "\n\n");
2373
2374 // Instead of moving up, just re-compute anchor-instruction's base address.
2375 Register Base = computeBase(MI, AnchorAddr);
2376
2377 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2378 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2379
2380 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2381 TargetLoweringBase::AddrMode AM;
2382 AM.HasBaseReg = true;
2383 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2384
2385 if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2386 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
2387 OtherMI->dump());
2388 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2389 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
2390 }
2391 }
2392 AnchorList.insert(AnchorInst);
2393 return true;
2394 }
2395
2396 return false;
2397}
2398
2399void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2400 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2401 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2402 if (AddrList.front().InstClass == CI.InstClass &&
2403 AddrList.front().IsAGPR == CI.IsAGPR &&
2404 AddrList.front().hasSameBaseAddress(CI)) {
2405 AddrList.emplace_back(CI);
2406 return;
2407 }
2408 }
2409
2410 // Base address not found, so add a new list.
2411 MergeableInsts.emplace_back(1, CI);
2412}
2413
2414std::pair<MachineBasicBlock::iterator, bool>
2415SILoadStoreOptimizer::collectMergeableInsts(
2416 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2417 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2418 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2419 bool Modified = false;
2420
2421 // Sort potentially mergeable instructions into lists, one list per base address.
2422 unsigned Order = 0;
2423 MachineBasicBlock::iterator BlockI = Begin;
2424 for (; BlockI != End; ++BlockI) {
2425 MachineInstr &MI = *BlockI;
2426
2427 // We run this before checking if an address is mergeable, because it can produce
2428 // better code even if the instructions aren't mergeable.
2429 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2430 Modified = true;
2431
2432 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2433 // barriers. The search can resume after such a barrier for separate merges.
2434 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2435 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2436
2437 // Search will resume after this instruction in a separate merge list.
2438 ++BlockI;
2439 break;
2440 }
2441
2442 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2443 if (InstClass == UNKNOWN)
2444 continue;
2445
2446 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2447 int Swizzled =
2448 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2449 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2450 continue;
2451
2452 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
2453 const MachineOperand *Fmt =
2454 TII->getNamedOperand(MI, AMDGPU::OpName::format);
2455 if (!AMDGPU::getGcnBufferFormatInfo(Fmt->getImm(), *STM)) {
2456 LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
2457 continue;
2458 }
2459 }
2460
2461 CombineInfo CI;
2462 CI.setMI(MI, *this);
2463 CI.Order = Order++;
2464
2465 if (!CI.hasMergeableAddress(*MRI))
2466 continue;
2467
2468 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2469 LLVM_DEBUG(
2470 dbgs() << "cannot merge ds writes with mixed AGPR and VGPR data\n");
2471
2472 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2473 // operands. However, we report that ds_write2 shall have
2474 // only VGPR data so that machine copy propagation does not
2475 // create an illegal instruction with mixed VGPR and AGPR sources.
2476 // Consequently, if we created such an instruction the verifier
2477 // would complain.
2478 continue;
2479 }
2480
2481 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2482
2483 addInstToMergeableList(CI, MergeableInsts);
2484 }
2485
2486 // At this point we have lists of Mergeable instructions.
2487 //
2488 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2489 // list try to find an instruction that can be merged with it. If an instruction
2490 // is found, it is stored in the Paired field. If no instruction is found, then
2491 // the CombineInfo object is deleted from the list.
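// For example, three dword buffer loads off the same base at offsets 0, 4
// and 8 land in a single list; after sorting, the loads at 0 and 4 are
// merged first, and because the combined width is still below the maximum
// the list is reprocessed so the result can also absorb the load at offset 8.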
2492
2493 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2494 E = MergeableInsts.end(); I != E;) {
2495
2496 std::list<CombineInfo> &MergeList = *I;
2497 if (MergeList.size() <= 1) {
2498 // This means we have found only one instruction with a given address
2499 // that can be merged, and we need at least 2 instructions to do a merge,
2500 // so this list can be discarded.
2501 I = MergeableInsts.erase(I);
2502 continue;
2503 }
2504
2505 // Sort the list by offset; this way mergeable instructions will be
2506 // adjacent to each other in the list, which will make it easier to find
2507 // matches.
2508 MergeList.sort(
2509 [] (const CombineInfo &A, const CombineInfo &B) {
2510 return A.Offset < B.Offset;
2511 });
2512 ++I;
2513 }
2514
2515 return {BlockI, Modified};
2516}
2517
2518// Scan through looking for adjacent LDS operations with constant offsets from
2519// the same base register. We rely on the scheduler to do the hard work of
2520// clustering nearby loads, and assume these are all adjacent.
2521bool SILoadStoreOptimizer::optimizeBlock(
2522 std::list<std::list<CombineInfo> > &MergeableInsts) {
2523 bool Modified = false;
2524
2525 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2526 E = MergeableInsts.end(); I != E;) {
2527 std::list<CombineInfo> &MergeList = *I;
2528
2529 bool OptimizeListAgain = false;
2530 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2531 // We weren't able to make any changes, so delete the list so we don't
2532 // process the same instructions the next time we try to optimize this
2533 // block.
2534 I = MergeableInsts.erase(I);
2535 continue;
2536 }
2537
2538 Modified = true;
2539
2540 // We made changes, but also determined that there were no more optimization
2541 // opportunities, so we don't need to reprocess the list
2542 if (!OptimizeListAgain) {
2543 I = MergeableInsts.erase(I);
2544 continue;
2545 }
2546 OptimizeAgain = true;
2547 }
2548 return Modified;
2549}
2550
2551bool
2552SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2553 std::list<CombineInfo> &MergeList,
2554 bool &OptimizeListAgain) {
2555 if (MergeList.empty())
2556 return false;
2557
2558 bool Modified = false;
2559
2560 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2561 Next = std::next(I)) {
2562
2563 auto First = I;
2564 auto Second = Next;
2565
2566 if ((*First).Order > (*Second).Order)
2567 std::swap(First, Second);
2568 CombineInfo &CI = *First;
2569 CombineInfo &Paired = *Second;
2570
2571 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2572 if (!Where) {
2573 ++I;
2574 continue;
2575 }
2576
2577 Modified = true;
2578
2579 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2580
2581 MachineBasicBlock::iterator NewMI;
2582 switch (CI.InstClass) {
2583 default:
2584 llvm_unreachable("unknown InstClass");
2585 break;
2586 case DS_READ:
2587 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2588 break;
2589 case DS_WRITE:
2590 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2591 break;
2592 case S_BUFFER_LOAD_IMM:
2593 case S_BUFFER_LOAD_SGPR_IMM:
2594 case S_LOAD_IMM:
2595 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2596 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2597 break;
2598 case BUFFER_LOAD:
2599 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2600 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2601 break;
2602 case BUFFER_STORE:
2603 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2604 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2605 break;
2606 case MIMG:
2607 NewMI = mergeImagePair(CI, Paired, Where->I);
2608 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2609 break;
2610 case TBUFFER_LOAD:
2611 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2612 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2613 break;
2614 case TBUFFER_STORE:
2615 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2616 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2617 break;
2618 case FLAT_LOAD:
2619 case FLAT_LOAD_SADDR:
2620 case GLOBAL_LOAD:
2621 case GLOBAL_LOAD_SADDR:
2622 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2623 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2624 break;
2625 case FLAT_STORE:
2626 case FLAT_STORE_SADDR:
2627 case GLOBAL_STORE:
2628 case GLOBAL_STORE_SADDR:
2629 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2630 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2631 break;
2632 }
2633 CI.setMI(NewMI, *this);
2634 CI.Order = Where->Order;
2635 if (I == Second)
2636 I = Next;
2637
2638 MergeList.erase(Second);
2639 }
2640
2641 return Modified;
2642}
2643
2644bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2645 if (skipFunction(MF.getFunction()))
2646 return false;
2647 return SILoadStoreOptimizer(
2648 &getAnalysis<AAResultsWrapperPass>().getAAResults())
2649 .run(MF);
2650}
2651
2652bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2653 STM = &MF.getSubtarget<GCNSubtarget>();
2654 if (!STM->loadStoreOptEnabled())
2655 return false;
2656
2657 TII = STM->getInstrInfo();
2658 TRI = &TII->getRegisterInfo();
2659
2660 MRI = &MF.getRegInfo();
2661
2662 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2663
2664 bool Modified = false;
2665
2666 // Contains the list of instructions for which constant offsets are being
2667 // promoted to the IMM. This is tracked for an entire block at a time.
2668 SmallPtrSet<MachineInstr *, 4> AnchorList;
2669 MemInfoMap Visited;
2670
2671 for (MachineBasicBlock &MBB : MF) {
2672 MachineBasicBlock::iterator SectionEnd;
2673 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2674 I = SectionEnd) {
2675 bool CollectModified;
2676 std::list<std::list<CombineInfo>> MergeableInsts;
2677
2678 // First pass: Collect list of all instructions we know how to merge in a
2679 // subset of the block.
2680 std::tie(SectionEnd, CollectModified) =
2681 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2682
2683 Modified |= CollectModified;
2684
2685 do {
2686 OptimizeAgain = false;
2687 Modified |= optimizeBlock(MergeableInsts);
2688 } while (OptimizeAgain);
2689 }
2690
2691 Visited.clear();
2692 AnchorList.clear();
2693 }
2694
2695 return Modified;
2696}
2697
2698PreservedAnalyses
2699SILoadStoreOptimizerPass::run(MachineFunction &MF,
2700 MachineFunctionAnalysisManager &MFAM) {
2701 MFPropsModifier _(*this, MF);
2702
2703 if (MF.getFunction().hasOptNone())
2704 return PreservedAnalyses::all();
2705
2706 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2707 .getManager();
2708 AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
2709
2710 bool Changed = SILoadStoreOptimizer(&AA).run(MF);
2711 if (!Changed)
2712 return PreservedAnalyses::all();
2713
2714 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2715 PA.preserveSet<CFGAnalyses>();
2716 return PA;
2717}