71 #define DEBUG_TYPE "si-load-store-opt"
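// Overview: this pass fuses adjacent memory operations that share a base
// address into a single wider operation, e.g.
//   ds_read_b32 v0, v2 offset:16
//   ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
// The same merging is applied to SMEM, MUBUF/MTBUF, MIMG and FLAT/GLOBAL
// loads and stores where the ISA provides a wider encoding (dwordx2/x3/x4/x8).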
79 S_BUFFER_LOAD_SGPR_IMM,
98 unsigned char NumVAddrs = 0;
101 bool SOffset = false;
109 const unsigned MaxAddressRegs = 12 + 1 + 1;
111 class SILoadStoreOptimizer {
120 InstClassEnum InstClass;
124 int AddrIdx[MaxAddressRegs];
126 unsigned NumAddresses;
129 bool hasSameBaseAddress(const CombineInfo &CI) {
130 if (NumAddresses != CI.NumAddresses)
134 for (unsigned i = 0; i < NumAddresses; i++) {
137 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
138 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
156 for (unsigned i = 0; i < NumAddresses; ++i) {
165 if (!AddrOp->isReg())
171 AddrOp->getReg() != AMDGPU::SGPR_NULL)
176 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
186 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
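// Two CombineInfos only stay merge candidates if every address operand
// matches pairwise: immediate operands must be equal and register operands
// must name the same register, which is what the per-operand comparison
// above implements.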
190 struct BaseRegisters {
194 unsigned LoSubReg = 0;
195 unsigned HiSubReg = 0;
217 static bool dmasksCanBeCombined(const CombineInfo &CI,
219 const CombineInfo &Paired);
220 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
221 CombineInfo &Paired, bool Modify = false);
222 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
223 const CombineInfo &Paired);
224 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
225 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
226 const CombineInfo &Paired);
228 getTargetRegisterClass(const CombineInfo &CI,
229 const CombineInfo &Paired) const;
232 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
234 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
237 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
239 AMDGPU::OpName OpName) const;
241 unsigned read2Opcode(unsigned EltSize) const;
242 unsigned read2ST64Opcode(unsigned EltSize) const;
244 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
247 unsigned write2Opcode(unsigned EltSize) const;
248 unsigned write2ST64Opcode(unsigned EltSize) const;
249 unsigned getWrite2Opcode(const CombineInfo &CI) const;
252 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
255 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
258 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
261 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
264 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
267 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
270 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
273 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
276 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
280 int32_t NewOffset) const;
292 std::list<std::list<CombineInfo> > &MergeableInsts) const;
297 std::list<std::list<CombineInfo>> &MergeableInsts) const;
300 const CombineInfo &Paired);
302 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
303 const CombineInfo &Paired);
305 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
306 bool &OptimizeListAgain);
307 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
337 const unsigned Opc = MI.getOpcode();
343 if (TII.isImage(MI)) {
345 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
353 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
354 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
355 case AMDGPU::S_LOAD_DWORD_IMM:
356 case AMDGPU::GLOBAL_LOAD_DWORD:
357 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
358 case AMDGPU::GLOBAL_STORE_DWORD:
359 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
360 case AMDGPU::FLAT_LOAD_DWORD:
361 case AMDGPU::FLAT_STORE_DWORD:
362 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
363 case AMDGPU::FLAT_STORE_DWORD_SADDR:
365 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
366 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
367 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
368 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
369 case AMDGPU::S_LOAD_DWORDX2_IMM:
370 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
371 case AMDGPU::GLOBAL_LOAD_DWORDX2:
372 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
373 case AMDGPU::GLOBAL_STORE_DWORDX2:
374 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
375 case AMDGPU::FLAT_LOAD_DWORDX2:
376 case AMDGPU::FLAT_STORE_DWORDX2:
377 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
378 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
380 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
381 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
382 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
383 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
384 case AMDGPU::S_LOAD_DWORDX3_IMM:
385 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
386 case AMDGPU::GLOBAL_LOAD_DWORDX3:
387 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
388 case AMDGPU::GLOBAL_STORE_DWORDX3:
389 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
390 case AMDGPU::FLAT_LOAD_DWORDX3:
391 case AMDGPU::FLAT_STORE_DWORDX3:
392 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
393 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
395 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
396 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
397 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
398 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
399 case AMDGPU::S_LOAD_DWORDX4_IMM:
400 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
401 case AMDGPU::GLOBAL_LOAD_DWORDX4:
402 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
403 case AMDGPU::GLOBAL_STORE_DWORDX4:
404 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
405 case AMDGPU::FLAT_LOAD_DWORDX4:
406 case AMDGPU::FLAT_STORE_DWORDX4:
407 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
408 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
410 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
411 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
412 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
413 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
414 case AMDGPU::S_LOAD_DWORDX8_IMM:
415 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
417 case AMDGPU::DS_READ_B32:
418 case AMDGPU::DS_READ_B32_gfx9:
419 case AMDGPU::DS_WRITE_B32:
420 case AMDGPU::DS_WRITE_B32_gfx9:
422 case AMDGPU::DS_READ_B64:
423 case AMDGPU::DS_READ_B64_gfx9:
424 case AMDGPU::DS_WRITE_B64:
425 case AMDGPU::DS_WRITE_B64_gfx9:
440 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
441 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
442 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
443 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
444 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
445 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
446 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
447 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
448 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
449 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
450 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
451 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
452 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
453 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
454 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
455 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
457 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
458 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
459 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
460 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
461 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
462 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
463 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
464 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
465 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
466 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
467 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
468 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
469 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
470 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
471 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
472 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
485 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
494 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
495 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
496 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
497 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
498 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
499 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
500 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
501 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
502 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
503 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
504 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
505 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
506 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
507 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
508 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
509 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
511 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
512 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
513 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
514 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
515 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
516 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
517 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
518 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
519 return TBUFFER_STORE;
523 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
524 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
525 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
526 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
527 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
528 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
529 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
530 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
531 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
532 return S_BUFFER_LOAD_IMM;
533 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
534 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
535 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
536 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
537 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
538 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
539 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
540 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
541 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
542 return S_BUFFER_LOAD_SGPR_IMM;
543 case AMDGPU::S_LOAD_DWORD_IMM:
544 case AMDGPU::S_LOAD_DWORDX2_IMM:
545 case AMDGPU::S_LOAD_DWORDX3_IMM:
546 case AMDGPU::S_LOAD_DWORDX4_IMM:
547 case AMDGPU::S_LOAD_DWORDX8_IMM:
548 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
549 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
550 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
551 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
553 case AMDGPU::DS_READ_B32:
554 case AMDGPU::DS_READ_B32_gfx9:
555 case AMDGPU::DS_READ_B64:
556 case AMDGPU::DS_READ_B64_gfx9:
558 case AMDGPU::DS_WRITE_B32:
559 case AMDGPU::DS_WRITE_B32_gfx9:
560 case AMDGPU::DS_WRITE_B64:
561 case AMDGPU::DS_WRITE_B64_gfx9:
563 case AMDGPU::GLOBAL_LOAD_DWORD:
564 case AMDGPU::GLOBAL_LOAD_DWORDX2:
565 case AMDGPU::GLOBAL_LOAD_DWORDX3:
566 case AMDGPU::GLOBAL_LOAD_DWORDX4:
567 case AMDGPU::FLAT_LOAD_DWORD:
568 case AMDGPU::FLAT_LOAD_DWORDX2:
569 case AMDGPU::FLAT_LOAD_DWORDX3:
570 case AMDGPU::FLAT_LOAD_DWORDX4:
572 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
573 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
574 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
575 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
576 return GLOBAL_LOAD_SADDR;
577 case AMDGPU::GLOBAL_STORE_DWORD:
578 case AMDGPU::GLOBAL_STORE_DWORDX2:
579 case AMDGPU::GLOBAL_STORE_DWORDX3:
580 case AMDGPU::GLOBAL_STORE_DWORDX4:
581 case AMDGPU::FLAT_STORE_DWORD:
582 case AMDGPU::FLAT_STORE_DWORDX2:
583 case AMDGPU::FLAT_STORE_DWORDX3:
584 case AMDGPU::FLAT_STORE_DWORDX4:
586 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
587 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
588 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
589 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
590 return GLOBAL_STORE_SADDR;
591 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
592 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
593 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
594 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
595 return FLAT_LOAD_SADDR;
596 case AMDGPU::FLAT_STORE_DWORD_SADDR:
597 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
598 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
599 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
600 return FLAT_STORE_SADDR;
615 return Info->BaseOpcode;
620 case AMDGPU::DS_READ_B32:
621 case AMDGPU::DS_READ_B32_gfx9:
622 case AMDGPU::DS_READ_B64:
623 case AMDGPU::DS_READ_B64_gfx9:
624 case AMDGPU::DS_WRITE_B32:
625 case AMDGPU::DS_WRITE_B32_gfx9:
626 case AMDGPU::DS_WRITE_B64:
627 case AMDGPU::DS_WRITE_B64_gfx9:
629 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
630 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
631 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
632 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
633 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
634 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
635 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
636 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
637 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
638 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
639 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
640 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
641 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
642 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
643 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
644 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
645 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
646 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
647 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
648 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
649 case AMDGPU::S_LOAD_DWORD_IMM:
650 case AMDGPU::S_LOAD_DWORDX2_IMM:
651 case AMDGPU::S_LOAD_DWORDX3_IMM:
652 case AMDGPU::S_LOAD_DWORDX4_IMM:
653 case AMDGPU::S_LOAD_DWORDX8_IMM:
654 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
655 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
656 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
657 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
658 return AMDGPU::S_LOAD_DWORD_IMM;
659 case AMDGPU::GLOBAL_LOAD_DWORD:
660 case AMDGPU::GLOBAL_LOAD_DWORDX2:
661 case AMDGPU::GLOBAL_LOAD_DWORDX3:
662 case AMDGPU::GLOBAL_LOAD_DWORDX4:
663 case AMDGPU::FLAT_LOAD_DWORD:
664 case AMDGPU::FLAT_LOAD_DWORDX2:
665 case AMDGPU::FLAT_LOAD_DWORDX3:
666 case AMDGPU::FLAT_LOAD_DWORDX4:
667 return AMDGPU::FLAT_LOAD_DWORD;
668 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
669 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
670 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
671 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
672 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
673 case AMDGPU::GLOBAL_STORE_DWORD:
674 case AMDGPU::GLOBAL_STORE_DWORDX2:
675 case AMDGPU::GLOBAL_STORE_DWORDX3:
676 case AMDGPU::GLOBAL_STORE_DWORDX4:
677 case AMDGPU::FLAT_STORE_DWORD:
678 case AMDGPU::FLAT_STORE_DWORDX2:
679 case AMDGPU::FLAT_STORE_DWORDX3:
680 case AMDGPU::FLAT_STORE_DWORDX4:
681 return AMDGPU::FLAT_STORE_DWORD;
682 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
683 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
684 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
685 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
686 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
687 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
688 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
689 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
690 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
691 return AMDGPU::FLAT_LOAD_DWORD_SADDR;
692 case AMDGPU::FLAT_STORE_DWORD_SADDR:
693 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
694 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
695 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
696 return AMDGPU::FLAT_STORE_DWORD_SADDR;
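// getInstClass() buckets each supported opcode into a merge class (DS_READ,
// S_BUFFER_LOAD_IMM, GLOBAL_STORE_SADDR, ...), while getInstSubclass() above
// collapses every width of a family onto its single-dword opcode, so e.g.
// S_LOAD_DWORD_IMM and S_LOAD_DWORDX2_IMM land in the same mergeable list and
// can still be paired in a later round.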
707 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
708 const CombineInfo &Paired) {
709 assert(CI.InstClass == Paired.InstClass);
711 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
713 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
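// If both FLAT accesses are known to touch only global memory (the guard is
// elided in this excerpt), the pair is treated as GLOBAL_LOAD/GLOBAL_STORE so
// that the merged instruction can use the wider GLOBAL opcodes.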
727 Result.SOffset = true;
733 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
734 if (VAddr0Idx >= 0) {
735 AMDGPU::OpName RsrcName =
736 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
737 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
738 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
755 Result.SOffset = true;
763 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
764 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
765 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
766 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
767 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
768 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
769 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
770 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
771 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
772 Result.SOffset = true;
774 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
775 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
776 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
777 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
778 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
779 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
780 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
781 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
782 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
783 case AMDGPU::S_LOAD_DWORD_IMM:
784 case AMDGPU::S_LOAD_DWORDX2_IMM:
785 case AMDGPU::S_LOAD_DWORDX3_IMM:
786 case AMDGPU::S_LOAD_DWORDX4_IMM:
787 case AMDGPU::S_LOAD_DWORDX8_IMM:
788 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
789 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
790 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
791 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
794 case AMDGPU::DS_READ_B32:
795 case AMDGPU::DS_READ_B64:
796 case AMDGPU::DS_READ_B32_gfx9:
797 case AMDGPU::DS_READ_B64_gfx9:
798 case AMDGPU::DS_WRITE_B32:
799 case AMDGPU::DS_WRITE_B64:
800 case AMDGPU::DS_WRITE_B32_gfx9:
801 case AMDGPU::DS_WRITE_B64_gfx9:
804 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
805 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
806 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
807 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
808 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
809 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
810 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
811 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
812 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
813 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
814 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
815 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
816 case AMDGPU::FLAT_STORE_DWORD_SADDR:
817 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
818 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
819 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
822 case AMDGPU::GLOBAL_LOAD_DWORD:
823 case AMDGPU::GLOBAL_LOAD_DWORDX2:
824 case AMDGPU::GLOBAL_LOAD_DWORDX3:
825 case AMDGPU::GLOBAL_LOAD_DWORDX4:
826 case AMDGPU::GLOBAL_STORE_DWORD:
827 case AMDGPU::GLOBAL_STORE_DWORDX2:
828 case AMDGPU::GLOBAL_STORE_DWORDX3:
829 case AMDGPU::GLOBAL_STORE_DWORDX4:
830 case AMDGPU::FLAT_LOAD_DWORD:
831 case AMDGPU::FLAT_LOAD_DWORDX2:
832 case AMDGPU::FLAT_LOAD_DWORDX3:
833 case AMDGPU::FLAT_LOAD_DWORDX4:
834 case AMDGPU::FLAT_STORE_DWORD:
835 case AMDGPU::FLAT_STORE_DWORDX2:
836 case AMDGPU::FLAT_STORE_DWORDX3:
837 case AMDGPU::FLAT_STORE_DWORDX4:
844 const SILoadStoreOptimizer &LSO) {
846 unsigned Opc = MI->getOpcode();
847 InstClass = getInstClass(Opc, *LSO.TII);
849 if (InstClass == UNKNOWN)
852 DataRC = LSO.getDataRegClass(*MI);
857 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
862 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
865 case S_BUFFER_LOAD_IMM:
866 case S_BUFFER_LOAD_SGPR_IMM:
875 if (InstClass == MIMG) {
876 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
880 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
881 Offset = I->getOperand(OffsetIdx).getImm();
884 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
885 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
888 EltSize = Info->BitsPerComp / 8;
891 Width = getOpcodeWidth(*I, *LSO.TII);
893 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
895 } else if (InstClass != MIMG) {
896 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
899 AddressRegs Regs = getRegs(Opc, *LSO.TII);
900 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
903 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
904 AddrIdx[NumAddresses++] =
905 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
907 AddrIdx[NumAddresses++] =
908 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
910 AddrIdx[NumAddresses++] =
911 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
913 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
914 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
916 AddrIdx[NumAddresses++] =
917 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
919 AddrIdx[NumAddresses++] =
920 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
922 AddrIdx[NumAddresses++] =
923 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
925 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
926 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
927 assert(NumAddresses <= MaxAddressRegs);
929 for (unsigned J = 0; J < NumAddresses; J++)
930 AddrReg[J] = &I->getOperand(AddrIdx[J]);
936 "SI Load Store Optimizer", false, false)
941 char SILoadStoreOptimizerLegacy::ID = 0;
946 return new SILoadStoreOptimizerLegacy();
952 for (const auto &Op : MI.operands()) {
962 bool SILoadStoreOptimizer::canSwapInstructions(
965 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
966 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
968 for (const auto &BOp : B.operands()) {
971 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
973 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
982 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
983 const CombineInfo &Paired) {
1003 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
1005 const CombineInfo &Paired) {
1006 assert(CI.InstClass == MIMG);
1009 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
1010 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
1012 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
1016 AMDGPU::OpName OperandsToMatch[] = {
1017 AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
1018 AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};
1020 for (AMDGPU::OpName op : OperandsToMatch) {
1021 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
1022 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
1025 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
1030 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
1031 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
1037 if ((1u << AllowedBitsForMin) <= MinMask)
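// The dmask check just above allows merging only when the smaller mask uses
// strictly lower channels than the larger one (AllowedBitsForMin is derived
// from the lowest set bit of MaxMask). For example, dmask 0x3 (xy) and 0xC
// (zw) can merge into 0xF, while 0x3 and 0x6 overlap in channel y and are
// rejected.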
1044 unsigned ComponentCount,
1046 if (ComponentCount > 4)
1065 return NewFormatInfo->Format;
1078 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1080 CombineInfo &Paired,
1082 assert(CI.InstClass != MIMG);
1086 if (CI.Offset == Paired.Offset)
1090 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1093 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1109 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1110 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1111 NumCombinedComponents = 4;
1119 unsigned ElemIndex0 = CI.Offset / CI.EltSize;
1120 unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
1121 if (ElemIndex0 + CI.Width != ElemIndex1 &&
1122 ElemIndex1 + Paired.Width != ElemIndex0)
1128 unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
1129 unsigned RequiredAlign = std::min(MergedBytes, 4u);
1130 unsigned MinOff = std::min(CI.Offset, Paired.Offset);
1131 if (MinOff % RequiredAlign != 0)
1137 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1138 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1143 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1144 if (EltOffset0 + CI.Width != EltOffset1 &&
1145 EltOffset1 + Paired.Width != EltOffset0)
1151 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1152 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1158 if (CI.Width != Paired.Width &&
1159 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1167 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1168 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1170 CI.Offset = EltOffset0 / 64;
1171 Paired.Offset = EltOffset1 / 64;
1178 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1180 CI.Offset = EltOffset0;
1181 Paired.Offset = EltOffset1;
1187 uint32_t Min = std::min(EltOffset0, EltOffset1);
1190 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1191 if (((Max - Min) & ~Mask) == 0) {
1199 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1200 CI.BaseOff = BaseOff * CI.EltSize;
1201 CI.Offset = (EltOffset0 - BaseOff) / 64;
1202 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1208 if (isUInt<8>(Max - Min)) {
1214 CI.BaseOff = BaseOff * CI.EltSize;
1215 CI.Offset = EltOffset0 - BaseOff;
1216 Paired.Offset = EltOffset1 - BaseOff;
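// DS_READ2/DS_WRITE2 encode two 8-bit element indices (offset0/offset1), so
// several encodings are attempted above: the plain form in units of EltSize
// (e.g. EltSize 4 with byte offsets 0 and 32 gives offset0:0 offset1:8), the
// ST64 form in units of 64 elements when both offsets are multiples of 64,
// and finally peeling a common BaseOff off both offsets (added back onto the
// address register during the merge) so that the remainders fit in 8 bits.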
1224 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1225 const CombineInfo &CI,
1226 const CombineInfo &Paired) {
1227 const unsigned Width = (CI.Width + Paired.Width);
1228 switch (CI.InstClass) {
1231 case S_BUFFER_LOAD_IMM:
1232 case S_BUFFER_LOAD_SGPR_IMM:
1248 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1249 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1250 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1252 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1253 return TRI->getRegClassForReg(*MRI, Src->getReg());
1255 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1256 return TRI->getRegClassForReg(*MRI, Src->getReg());
1258 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1259 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1261 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1262 return TRI->getRegClassForReg(*MRI, Src->getReg());
1269 SILoadStoreOptimizer::CombineInfo *
1270 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1271 CombineInfo &Paired) {
1274 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1276 assert(CI.InstClass == Paired.InstClass);
1278 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1279 getInstSubclass(Paired.I->getOpcode(), *TII))
1284 if (CI.InstClass == MIMG) {
1285 if (!dmasksCanBeCombined(CI, *TII, Paired))
1288 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1295 if (CI.I->mayLoad()) {
1299 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1307 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1317 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1318 offsetsCanBeCombined(CI, *STM, Paired, true);
1320 if (CI.InstClass == DS_WRITE) {
1329 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1331 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1334 int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
1335 AMDGPU::OpName::data0);
1336 int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
1337 AMDGPU::OpName::data1);
1340 TII->getRegClass(Write2Opc, Data0Idx, TRI, *MF);
1343 TII->getRegClass(Write2Opc, Data1Idx, TRI, *MF);
1346 DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
1351 DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),
1355 if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
1356 !MRI->constrainRegClass(Data1->getReg(), DataRC1))
1368 void SILoadStoreOptimizer::copyToDestRegs(
1369 CombineInfo &CI, CombineInfo &Paired,
1375 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1379 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1380 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1385 Dest0->setIsEarlyClobber(false);
1386 Dest1->setIsEarlyClobber(false);
1390 .addReg(DestReg, 0, SubRegIdx0);
1399 SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1401 AMDGPU::OpName OpName) const {
1405 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1409 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1411 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1412 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1414 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1423 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1425 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1426 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1429 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1431 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1433 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1434 : AMDGPU::DS_READ2ST64_B64_gfx9;
1438 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1444 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1446 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1447 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1449 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1451 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1452 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1457 Register DestReg = MRI->createVirtualRegister(SuperRC);
1462 unsigned BaseSubReg = AddrReg->getSubReg();
1463 unsigned BaseRegFlags = 0;
1465 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1469 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1472 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1474 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1481 .addReg(BaseReg, BaseRegFlags, BaseSubReg)
1487 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1489 CI.I->eraseFromParent();
1490 Paired.I->eraseFromParent();
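// The merged DS_READ2 defines one super-register; copyToDestRegs() then
// recreates the two original destinations as COPYs of its sub-registers
// (sub0/sub1 for b32 elements, sub0_sub1/sub2_sub3 for b64), so users of the
// old results are unaffected and the original instructions can be erased.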
1496 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1498 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1499 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1500 : AMDGPU::DS_WRITE2_B64_gfx9;
1503 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1505 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1506 : AMDGPU::DS_WRITE2ST64_B64;
1508 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1509 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1512 unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
1513 return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1517 CombineInfo &CI, CombineInfo &Paired,
1524 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1526 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1528 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1530 unsigned NewOffset0 = CI.Offset;
1531 unsigned NewOffset1 = Paired.Offset;
1532 unsigned Opc = getWrite2Opcode(CI);
1534 if (NewOffset0 > NewOffset1) {
1540 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1541 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1547 unsigned BaseSubReg = AddrReg->getSubReg();
1548 unsigned BaseRegFlags = 0;
1550 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1554 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1557 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1559 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1566 .addReg(BaseReg, BaseRegFlags, BaseSubReg)
1575 Paired.I->eraseFromParent();
1577 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1582 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1586 const unsigned Opcode = getNewOpcode(CI, Paired);
1590 Register DestReg = MRI->createVirtualRegister(SuperRC);
1591 unsigned MergedDMask = CI.DMask | Paired.DMask;
1593 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1595 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1596 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1598 MIB.addImm(MergedDMask);
1600 MIB.add((*CI.I).getOperand(I));
1606 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1608 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1610 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1612 CI.I->eraseFromParent();
1613 Paired.I->eraseFromParent();
1618 CombineInfo &CI, CombineInfo &Paired,
1622 const unsigned Opcode = getNewOpcode(CI, Paired);
1626 Register DestReg = MRI->createVirtualRegister(SuperRC);
1627 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1632 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1636 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1637 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1638 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1639 New.addImm(MergedOffset);
1640 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1642 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1644 CI.I->eraseFromParent();
1645 Paired.I->eraseFromParent();
1650 CombineInfo &CI, CombineInfo &Paired,
1655 const unsigned Opcode = getNewOpcode(CI, Paired);
1660 Register DestReg = MRI->createVirtualRegister(SuperRC);
1661 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1663 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1665 AddressRegs Regs = getRegs(Opcode, *TII);
1668 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1673 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1676 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1677 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1678 .addImm(MergedOffset)
1681 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1683 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1685 CI.I->eraseFromParent();
1686 Paired.I->eraseFromParent();
1691 CombineInfo &CI, CombineInfo &Paired,
1696 const unsigned Opcode = getNewOpcode(CI, Paired);
1701 Register DestReg = MRI->createVirtualRegister(SuperRC);
1702 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1704 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1706 AddressRegs Regs = getRegs(Opcode, *TII);
1709 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1714 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1715 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1716 NumCombinedComponents = 4;
1717 unsigned JoinedFormat =
1723 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1726 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1727 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1728 .addImm(MergedOffset)
1729 .addImm(JoinedFormat)
1732 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1734 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1736 CI.I->eraseFromParent();
1737 Paired.I->eraseFromParent();
1742 CombineInfo &CI, CombineInfo &Paired,
1747 const unsigned Opcode = getNewOpcode(CI, Paired);
1750 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1755 AddressRegs Regs = getRegs(Opcode, *TII);
1758 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1763 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1764 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1765 NumCombinedComponents = 4;
1766 unsigned JoinedFormat =
1772 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1775 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1776 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1777 .addImm(std::min(CI.Offset, Paired.Offset))
1778 .addImm(JoinedFormat)
1781 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1783 CI.I->eraseFromParent();
1784 Paired.I->eraseFromParent();
1789 CombineInfo &CI, CombineInfo &Paired,
1794 const unsigned Opcode = getNewOpcode(CI, Paired);
1797 Register DestReg = MRI->createVirtualRegister(SuperRC);
1799 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1801 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1805 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1806 .addImm(std::min(CI.Offset, Paired.Offset))
1808 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1810 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1812 CI.I->eraseFromParent();
1813 Paired.I->eraseFromParent();
1818 CombineInfo &CI, CombineInfo &Paired,
1823 const unsigned Opcode = getNewOpcode(CI, Paired);
1826 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1829 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1832 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1836 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1838 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1840 CI.I->eraseFromParent();
1841 Paired.I->eraseFromParent();
1850 (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
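// Sketch of the intent here: with XNACK enabled, a merged scalar load whose
// combined memory operand is not known to be aligned to the full width (or
// that was stitched together from several MMOs) picks the "_ec" early-clobber
// opcode variants, keeping the destination from overlapping the address
// registers in case the access has to be replayed.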
1853 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1854 const CombineInfo &Paired) {
1855 const unsigned Width = CI.Width + Paired.Width;
1857 switch (getCommonInstClass(CI, Paired)) {
1859 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1870 case S_BUFFER_LOAD_IMM: {
1873 bool NeedsConstrainedOpc =
1879 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1880 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1882 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1883 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1885 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1886 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1888 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1889 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1892 case S_BUFFER_LOAD_SGPR_IMM: {
1895 bool NeedsConstrainedOpc =
1901 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1902 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1904 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1905 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1907 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1908 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1910 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1911 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1917 bool NeedsConstrainedOpc =
1923 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1924 : AMDGPU::S_LOAD_DWORDX2_IMM;
1926 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1927 : AMDGPU::S_LOAD_DWORDX3_IMM;
1929 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1930 : AMDGPU::S_LOAD_DWORDX4_IMM;
1932 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1933 : AMDGPU::S_LOAD_DWORDX8_IMM;
1941 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1943 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1945 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1947 case GLOBAL_LOAD_SADDR:
1952 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1954 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1956 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1963 return AMDGPU::GLOBAL_STORE_DWORDX2;
1965 return AMDGPU::GLOBAL_STORE_DWORDX3;
1967 return AMDGPU::GLOBAL_STORE_DWORDX4;
1969 case GLOBAL_STORE_SADDR:
1974 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1976 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1978 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1985 return AMDGPU::FLAT_LOAD_DWORDX2;
1987 return AMDGPU::FLAT_LOAD_DWORDX3;
1989 return AMDGPU::FLAT_LOAD_DWORDX4;
1996 return AMDGPU::FLAT_STORE_DWORDX2;
1998 return AMDGPU::FLAT_STORE_DWORDX3;
2000 return AMDGPU::FLAT_STORE_DWORDX4;
2002 case FLAT_LOAD_SADDR:
2007 return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
2009 return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
2011 return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
2013 case FLAT_STORE_SADDR:
2018 return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
2020 return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
2022 return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
2031 std::pair<unsigned, unsigned>
2032 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
2033 const CombineInfo &Paired) {
2034 assert((CI.InstClass != MIMG ||
2036 CI.Width + Paired.Width)) &&
2042 static const unsigned Idxs[5][4] = {
2043 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
2044 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
2045 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
2046 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
2047 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
2050 assert(CI.Width >= 1 && CI.Width <= 4);
2051 assert(Paired.Width >= 1 && Paired.Width <= 4);
2054 Idx1 = Idxs[0][Paired.Width - 1];
2055 Idx0 = Idxs[Paired.Width][CI.Width - 1];
2057 Idx0 = Idxs[0][CI.Width - 1];
2058 Idx1 = Idxs[CI.Width][Paired.Width - 1];
2061 return {Idx0, Idx1};
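// Example: merging a 1-dword access at the lower offset (CI) with a 2-dword
// access (Paired) yields Idx0 = sub0 and Idx1 = sub1_sub2, i.e. CI's data
// occupies dword 0 of the merged register and Paired's data dwords 1-2; the
// roles swap in the branch where Paired has the lower offset.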
2065 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
2066 const CombineInfo &Paired) const {
2067 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
2068 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
2069 switch (CI.Width + Paired.Width) {
2073 return &AMDGPU::SReg_64_XEXECRegClass;
2075 return &AMDGPU::SGPR_96RegClass;
2077 return &AMDGPU::SGPR_128RegClass;
2079 return &AMDGPU::SGPR_256RegClass;
2081 return &AMDGPU::SGPR_512RegClass;
2087 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
2088 return TRI->isAGPRClass(getDataRegClass(*CI.I))
2094 CombineInfo &CI, CombineInfo &Paired,
2099 const unsigned Opcode = getNewOpcode(CI, Paired);
2102 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
2107 AddressRegs Regs = getRegs(Opcode, *TII);
2110 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
2116 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
2119 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
2120 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
2121 .addImm(std::min(CI.Offset, Paired.Offset))
2124 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
2126 CI.I->eraseFromParent();
2127 Paired.I->eraseFromParent();
2132 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
2134 if (TII->isInlineConstant(V))
2137 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2139 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
2140 TII->get(AMDGPU::S_MOV_B32), Reg)
2149 const MemAddress &Addr) const {
2155 Addr.Base.LoSubReg) &&
2156 "Expected 32-bit Base-Register-Low!!");
2159 Addr.Base.HiSubReg) &&
2160 "Expected 32-bit Base-Register-Hi!!");
2165 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
2167 const auto *CarryRC = TRI->getWaveMaskRegClass();
2168 Register CarryReg = MRI->createVirtualRegister(CarryRC);
2169 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2171 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2172 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2192 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2208 int32_t NewOffset) const {
2209 auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2210 Base->setReg(NewBase);
2211 Base->setIsKill(false);
2212 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2215 std::optional<int32_t>
2221 return std::nullopt;
2224 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2225 !Def->getOperand(1).isImm())
2226 return std::nullopt;
2228 return Def->getOperand(1).getImm();
2242 MemAddress &Addr) const {
2247 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2248 || Def->getNumOperands() != 5)
2259 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2260 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2263 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2264 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2266 auto Offset0P = extractConstOffset(*Src0);
2270 if (!(Offset0P = extractConstOffset(*Src1)))
2275 if (!BaseLo.isReg())
2278 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2279 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2284 if (!Src1->isImm() || Src0->isImm())
2290 if (!BaseHi.isReg())
2297 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2300 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2302 MemInfoMap &Visited,
2320 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2327 auto [It, Inserted] = Visited.try_emplace(&MI);
2330 processBaseWithConstOffset(Base, MAddr);
2335 if (MAddr.Offset == 0) {
2336 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2337 " constant offsets that can be promoted.\n";);
2343 << "} Offset: " << MAddr.Offset << "\n\n";);
2371 MemAddress AnchorAddr;
2372 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2386 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2390 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2391 MemAddress MAddrNext;
2392 auto [It, Inserted] = Visited.try_emplace(&MINext);
2394 processBaseWithConstOffset(BaseNext, MAddrNext);
2395 It->second = MAddrNext;
2397 MAddrNext = It->second;
2399 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2400 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2401 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2402 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2405 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2407 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2412 (uint32_t)std::abs(Dist) > MaxDist) {
2413 MaxDist = std::abs(Dist);
2415 AnchorAddr = MAddrNext;
2416 AnchorInst = &MINext;
2421 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2422 AnchorInst->dump());
2424 << AnchorAddr.Offset << "\n\n");
2429 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2432 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2435 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2440 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2451 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2452 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2453 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2454 if (AddrList.front().InstClass == CI.InstClass &&
2455 AddrList.front().hasSameBaseAddress(CI)) {
2456 AddrList.emplace_back(CI);
2462 MergeableInsts.emplace_back(1, CI);
2465 std::pair<MachineBasicBlock::iterator, bool>
2466 SILoadStoreOptimizer::collectMergeableInsts(
2469 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2475 for (; BlockI != End; ++BlockI) {
2480 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2485 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2493 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2494 if (InstClass == UNKNOWN)
2499 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2500 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2503 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
2505 TII->getNamedOperand(MI, AMDGPU::OpName::format);
2513 CI.setMI(MI, *this);
2516 if (!CI.hasMergeableAddress(*MRI))
2531 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2532 E = MergeableInsts.end(); I != E;) {
2534 std::list<CombineInfo> &MergeList = *I;
2535 if (MergeList.size() <= 1) {
2539 I = MergeableInsts.erase(I);
2547 [](const CombineInfo &A, const CombineInfo &B) {
2548 return A.Offset < B.Offset;
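// Each list of mergeable instructions is sorted by offset, so the pairing
// step only ever has to consider neighbouring entries; single-element lists
// are erased here since there is nothing left to merge.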
2559 bool SILoadStoreOptimizer::optimizeBlock(
2560 std::list<std::list<CombineInfo> > &MergeableInsts) {
2563 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2564 E = MergeableInsts.end(); I != E;) {
2565 std::list<CombineInfo> &MergeList = *I;
2567 bool OptimizeListAgain = false;
2568 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2572 I = MergeableInsts.erase(I);
2580 if (!OptimizeListAgain) {
2581 I = MergeableInsts.erase(I);
2584 OptimizeAgain = true;
2590 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2591 std::list<CombineInfo> &MergeList,
2592 bool &OptimizeListAgain) {
2593 if (MergeList.empty())
2598 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2599 Next = std::next(I)) {
2604 if ((*First).Order > (*Second).Order)
2606 CombineInfo &CI = *First;
2607 CombineInfo &Paired = *Second;
2609 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2617 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2620 switch (CI.InstClass) {
2625 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2628 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2630 case S_BUFFER_LOAD_IMM:
2631 case S_BUFFER_LOAD_SGPR_IMM:
2633 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2634 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2637 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2638 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2641 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2642 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2645 NewMI = mergeImagePair(CI, Paired, Where->I);
2646 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2649 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2650 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2653 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2654 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2657 case FLAT_LOAD_SADDR:
2659 case GLOBAL_LOAD_SADDR:
2660 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2661 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2664 case FLAT_STORE_SADDR:
2666 case GLOBAL_STORE_SADDR:
2667 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2668 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2671 CI.setMI(NewMI, *this);
2672 CI.Order = Where->Order;
2676 MergeList.erase(Second);
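// The merged instruction replaces CI in the list (setMI above) and takes over
// the earlier Order, so another pass over the same list can keep growing it,
// e.g. two dwordx2 results later combining into a dwordx4, for as long as
// OptimizeListAgain/OptimizeAgain keep the loops running.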
2682 bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2685 return SILoadStoreOptimizer(
2686 &getAnalysis<AAResultsWrapperPass>().getAAResults())
2697 TRI = &TII->getRegisterInfo();
2714 bool CollectModified;
2715 std::list<std::list<CombineInfo>> MergeableInsts;
2719 std::tie(SectionEnd, CollectModified) =
2725 OptimizeAgain = false;
2727 } while (OptimizeAgain);
2749 bool Changed = SILoadStoreOptimizer(&AA).run(MF);
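// Both entry points are thin wrappers: the legacy pass above and the new
// pass manager's run() each construct a SILoadStoreOptimizer with the
// function's AAResults and call run(MF), so the optimizer itself is
// pass-manager agnostic.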