#define DEBUG_TYPE "si-load-store-opt"

  S_BUFFER_LOAD_SGPR_IMM,

  unsigned char NumVAddrs = 0;
  bool SOffset = false;

const unsigned MaxAddressRegs = 12 + 1 + 1;
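// Added note (not in the original source): MaxAddressRegs bounds the fixed
// AddrIdx/AddrReg arrays in CombineInfo below; the 12 + 1 + 1 breakdown is
// presumably the worst case of up to 12 vector address operands plus a
// resource descriptor and a sampler/offset operand for image instructions.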
class SILoadStoreOptimizer {
    InstClassEnum InstClass;
    int AddrIdx[MaxAddressRegs];
    unsigned NumAddresses;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)

      for (unsigned i = 0; i < NumAddresses; i++) {
        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||

      for (unsigned i = 0; i < NumAddresses; ++i) {
        if (!AddrOp->isReg())
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))

      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
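      // Added note (a reading of the comparator above, not authoritative):
      // MIMG CombineInfos are ordered by DMask, everything else by Offset, so
      // after sorting a merge list the natural pairing candidates end up next
      // to each other.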
    struct BaseRegisters {
      unsigned LoSubReg = 0;
      unsigned HiSubReg = 0;

  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           AMDGPU::OpName OpName) const;

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,

                           int32_t NewOffset) const;

                     std::list<std::list<CombineInfo>> &MergeableInsts) const;
                     std::list<std::list<CombineInfo>> &MergeableInsts) const;
                                  const CombineInfo &Paired);
  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);
  const unsigned Opc = MI.getOpcode();

  if (TII.isImage(MI)) {
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORD_SADDR:

  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:

  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:

  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:

  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:

  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
  case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
  case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:

  case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
  case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
  case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:

  if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||

  case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:

  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
    return TBUFFER_STORE;

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:

  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
    return FLAT_LOAD_SADDR;
  case AMDGPU::FLAT_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
    return FLAT_STORE_SADDR;

  return Info->BaseOpcode;

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
    return AMDGPU::FLAT_LOAD_DWORD_SADDR;
  case AMDGPU::FLAT_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
    return AMDGPU::FLAT_STORE_DWORD_SADDR;
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

    Result.SOffset = true;

  int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
  if (VAddr0Idx >= 0) {
    AMDGPU::OpName RsrcName =
        TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
    int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
    Result.NumVAddrs = RsrcIdx - VAddr0Idx;

    Result.SOffset = true;

  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    Result.SOffset = true;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
  case AMDGPU::FLAT_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
                                           const SILoadStoreOptimizer &LSO) {
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8

  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();

    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
    EltSize = Info->BitsPerComp / 8;

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);

                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizerLegacy::ID = 0;
  return new SILoadStoreOptimizerLegacy();

  for (const auto &Op : MI.operands()) {

bool SILoadStoreOptimizer::canSwapInstructions(
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
  for (const auto &BOp : B.operands()) {
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))

SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))

  AMDGPU::OpName OperandsToMatch[] = {
      AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
      AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (AMDGPU::OpName op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())

  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if ((1u << AllowedBitsForMin) <= MinMask)
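  // Added worked example (interpretation of the check above, not from the
  // source): the smaller dmask must lie entirely below the lowest set bit of
  // the larger one. DMasks 0b0011 and 0b1100 can combine (countr_zero(0b1100)
  // is 2, and 1 << 2 = 4 > 0b0011), while 0b0011 and 0b0110 are rejected
  // because their component ranges overlap (1 << countr_zero(0b0110) = 2,
  // which is <= 0b0011).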
                                             unsigned ComponentCount,
  if (ComponentCount > 4)

  return NewFormatInfo->Format;

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                CombineInfo &Paired,
  assert(CI.InstClass != MIMG);

  if (CI.Offset == Paired.Offset)

  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
    unsigned NumCombinedComponents = CI.Width + Paired.Width;
    if (NumCombinedComponents == 3 && CI.EltSize <= 2)
      NumCombinedComponents = 4;

    unsigned ElemIndex0 = CI.Offset / CI.EltSize;
    unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
    if (ElemIndex0 + CI.Width != ElemIndex1 &&
        ElemIndex1 + Paired.Width != ElemIndex0)

    unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
    unsigned RequiredAlign = std::min(MergedBytes, 4u);
    unsigned MinOff = std::min(CI.Offset, Paired.Offset);
    if (MinOff % RequiredAlign != 0)

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)

  if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
    if (CI.Width != Paired.Width &&
        (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))

  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;

  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;

  uint32_t Min = std::min(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;

  if (isUInt<8>(Max - Min)) {
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
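  // Added worked example (illustrative, not from the source): DS read2/write2
  // offsets are 8-bit element indices. With EltSize = 4, byte offsets 8 and 12
  // become element offsets 2 and 3 and fit the plain read2 encoding; byte
  // offsets 0 and 1024 (element indices 0 and 256) only fit the ST64 form,
  // which scales the encoded offsets by 64 elements (0 and 4). When neither
  // encoding fits, the code above materializes BaseOff into a register and
  // re-bases both offsets relative to it.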
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:

SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());

SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))

  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))

  if (CI.I->mayLoad()) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))

  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

      .addReg(DestReg, 0, SubRegIdx0);

SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      AMDGPU::OpName OpName) const {
  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;

SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
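// Added illustrative sketch (assumed MIR, not taken from the source): two
// adjacent LDS loads such as
//   %v0:vgpr_32 = DS_READ_B32 %addr, 0, 0
//   %v1:vgpr_32 = DS_READ_B32 %addr, 4, 0
// are rewritten by mergeRead2Pair into a single
//   %d:vreg_64 = DS_READ2_B32 %addr, 0, 1, 0
// and copyToDestRegs() then copies %d.sub0 / %d.sub1 back into the original
// destinations before the old instructions are erased.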
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;

    CombineInfo &CI, CombineInfo &Paired,
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');

SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
      MIB.addImm(MergedDMask);
      MIB.add((*CI.I).getOperand(I));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned NumCombinedComponents = CI.Width + Paired.Width;
  if (NumCombinedComponents == 3 && CI.EltSize <= 2)
    NumCombinedComponents = 4;
  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned NumCombinedComponents = CI.Width + Paired.Width;
  if (NumCombinedComponents == 3 && CI.EltSize <= 2)
    NumCombinedComponents = 4;
  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

         (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
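// Added note (interpretation, not from the source): the expression above is
// the tail of needsConstrainedOpcode(). With XNACK enabled, a merged scalar
// load whose memory operands could not be combined into one naturally aligned
// MMO has to use the constrained "_ec" (early-clobber) opcode variants chosen
// in getNewOpcode() below, so the wider destination can never overlap the
// address registers of a load that may be replayed.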
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);

  case S_BUFFER_LOAD_IMM: {
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;

  case S_BUFFER_LOAD_SGPR_IMM: {
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;

    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX8_IMM;

      return AMDGPU::GLOBAL_LOAD_DWORDX2;
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
  case GLOBAL_LOAD_SADDR:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX2;
      return AMDGPU::GLOBAL_STORE_DWORDX3;
      return AMDGPU::GLOBAL_STORE_DWORDX4;
  case GLOBAL_STORE_SADDR:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
      return AMDGPU::FLAT_LOAD_DWORDX2;
      return AMDGPU::FLAT_LOAD_DWORDX3;
      return AMDGPU::FLAT_LOAD_DWORDX4;
      return AMDGPU::FLAT_STORE_DWORDX2;
      return AMDGPU::FLAT_STORE_DWORDX3;
      return AMDGPU::FLAT_STORE_DWORDX4;
  case FLAT_LOAD_SADDR:
      return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
      return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
      return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
  case FLAT_STORE_SADDR:
      return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
      return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
      return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
           CI.Width + Paired.Width)) &&

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];

  return {Idx0, Idx1};
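// Added worked example (a direct reading of the Idxs table above): merging a
// one-dword CI with a two-dword Paired that follows it in memory yields
// Idx0 = Idxs[0][0] = sub0 for CI and Idx1 = Idxs[1][1] = sub1_sub2 for
// Paired, i.e. dword 0 and dwords 1-2 of the merged register; the other
// branch above swaps the roles when Paired comes first.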
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) const {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
      return &AMDGPU::SReg_64_XEXECRegClass;
      return &AMDGPU::SGPR_96RegClass;
      return &AMDGPU::SGPR_128RegClass;
      return &AMDGPU::SGPR_256RegClass;
      return &AMDGPU::SGPR_512RegClass;

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))

    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  if (TII->isInlineConstant(V))

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), Reg)

                                         const MemAddress &Addr) const {
             Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");
             Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getWaveMaskRegClass();
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());

                                              int32_t NewOffset) const {
  auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
std::optional<int32_t>
    return std::nullopt;

  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();

                                                    MemAddress &Addr) const {
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
    if (!(Offset0P = extractConstOffset(*Src1)))

  if (!BaseLo.isReg())

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (!Src1->isImm() || Src0->isImm())

  if (!BaseHi.isReg())

  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MemInfoMap &Visited,
  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {

  auto [It, Inserted] = Visited.try_emplace(&MI);
    processBaseWithConstOffset(Base, MAddr);

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);

                    << "} Offset: " << MAddr.Offset << "\n\n";);

  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();

        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())

        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    auto [It, Inserted] = Visited.try_emplace(&MINext);
      processBaseWithConstOffset(BaseNext, MAddrNext);
      It->second = MAddrNext;
      MAddrNext = It->second;

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)

    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);
      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;

  LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
             AnchorInst->dump());
                    << AnchorAddr.Offset << "\n\n");

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);

    for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
      AM.BaseOffs = OtherOffset - AnchorAddr.Offset;

        updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
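    // Added worked sketch (hypothetical offsets, not from the source): for
    // global loads at Base+2048, Base+2056 and Base+2072 whose 64-bit address
    // is built with V_ADD_CO_U32_e64 / V_ADDC_U32_e64, the access farthest
    // from the current one becomes the anchor; the remaining loads are then
    // rewritten against the anchor's base register so that small deltas such
    // as 8 and 24 fit in the flat/global immediate offset field and the extra
    // address arithmetic can be folded away.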
void SILoadStoreOptimizer::addInstToMergeableList(
    const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);

  MergeableInsts.emplace_back(1, CI);

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (; BlockI != End; ++BlockI) {

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))

    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)

        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())

    if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
          TII->getNamedOperand(MI, AMDGPU::OpName::format);

    CI.setMI(MI, *this);

    if (!CI.hasMergeableAddress(*MRI))

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
          dbgs() << "cannot merge ds writes with mixed AGPR and VGPR data\n");

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      I = MergeableInsts.erase(I);

        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {
  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      I = MergeableInsts.erase(I);

    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
    OptimizeAgain = true;

SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) {
  if (MergeList.empty())

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    if ((*First).Order > (*Second).Order)
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case FLAT_LOAD_SADDR:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case FLAT_STORE_SADDR:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;

    MergeList.erase(Second);
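    // Added note (interpretation of the OptimizeListAgain flags above, not
    // from the source): a merge that has not yet reached the maximum width
    // (4 dwords for vector memory ops, 8 for scalar loads) requests another
    // pass over the list, so e.g. four adjacent 1-dword buffer loads collapse
    // pairwise into x2 loads and then into a single x4 load.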
bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
  return SILoadStoreOptimizer(
             &getAnalysis<AAResultsWrapperPass>().getAAResults())

  TRI = &TII->getRegisterInfo();

    bool CollectModified;
    std::list<std::list<CombineInfo>> MergeableInsts;

    std::tie(SectionEnd, CollectModified) =

      OptimizeAgain = false;
    } while (OptimizeAgain);

  bool Changed = SILoadStoreOptimizer(&AA).run(MF);