54#define DEBUG_TYPE "aarch64-falkor-hwpf-fix"
56STATISTIC(NumStridedLoadsMarked,
"Number of strided loads marked");
58 "Number of HW prefetch tag collisions avoided");
60 "Number of HW prefetch tag collisions not avoided due to lack of registers");
62 "Controls which tag collisions are avoided");
66class FalkorMarkStridedAccesses {
74 bool runOnLoop(
Loop &L);
80class FalkorMarkStridedAccessesLegacy :
public FunctionPass {
100char FalkorMarkStridedAccessesLegacy::ID = 0;
103 "Falkor HW Prefetch Fix",
false,
false)
111 return new FalkorMarkStridedAccessesLegacy();
114bool FalkorMarkStridedAccessesLegacy::runOnFunction(
Function &
F) {
118 if (ST->getProcFamily() != AArch64Subtarget::Falkor)
124 LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
125 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
127 FalkorMarkStridedAccesses LDP(LI, SE);
131bool FalkorMarkStridedAccesses::run() {
132 bool MadeChange =
false;
136 MadeChange |= runOnLoop(*LIt);
141bool FalkorMarkStridedAccesses::runOnLoop(
Loop &L) {
143 if (!
L.isInnermost())
146 bool MadeChange =
false;
150 LoadInst *LoadI = dyn_cast<LoadInst>(&
I);
155 if (
L.isLoopInvariant(PtrValue))
159 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
160 if (!LSCEVAddRec || !LSCEVAddRec->
isAffine())
165 ++NumStridedLoadsMarked;
211 bool IsPrePost =
false;
216char FalkorHWPFFix::ID = 0;
219 "Falkor HW Prefetch Fix Late Phase",
false,
false)
225 return (Dest & 0xf) | ((
Base & 0xf) << 4) | ((
Offset & 0x3f) << 8);
234 switch (
MI.getOpcode()) {
238 case AArch64::LD1i64:
239 case AArch64::LD2i64:
247 case AArch64::LD1i16:
248 case AArch64::LD1i32:
250 case AArch64::LD2i16:
251 case AArch64::LD2i32:
253 case AArch64::LD3i16:
254 case AArch64::LD3i32:
255 case AArch64::LD3i64:
257 case AArch64::LD4i16:
258 case AArch64::LD4i32:
259 case AArch64::LD4i64:
266 case AArch64::LD1Onev1d:
267 case AArch64::LD1Onev2s:
268 case AArch64::LD1Onev4h:
269 case AArch64::LD1Onev8b:
270 case AArch64::LD1Onev2d:
271 case AArch64::LD1Onev4s:
272 case AArch64::LD1Onev8h:
273 case AArch64::LD1Onev16b:
274 case AArch64::LD1Rv1d:
275 case AArch64::LD1Rv2s:
276 case AArch64::LD1Rv4h:
277 case AArch64::LD1Rv8b:
278 case AArch64::LD1Rv2d:
279 case AArch64::LD1Rv4s:
280 case AArch64::LD1Rv8h:
281 case AArch64::LD1Rv16b:
288 case AArch64::LD1Twov1d:
289 case AArch64::LD1Twov2s:
290 case AArch64::LD1Twov4h:
291 case AArch64::LD1Twov8b:
292 case AArch64::LD1Twov2d:
293 case AArch64::LD1Twov4s:
294 case AArch64::LD1Twov8h:
295 case AArch64::LD1Twov16b:
296 case AArch64::LD1Threev1d:
297 case AArch64::LD1Threev2s:
298 case AArch64::LD1Threev4h:
299 case AArch64::LD1Threev8b:
300 case AArch64::LD1Threev2d:
301 case AArch64::LD1Threev4s:
302 case AArch64::LD1Threev8h:
303 case AArch64::LD1Threev16b:
304 case AArch64::LD1Fourv1d:
305 case AArch64::LD1Fourv2s:
306 case AArch64::LD1Fourv4h:
307 case AArch64::LD1Fourv8b:
308 case AArch64::LD1Fourv2d:
309 case AArch64::LD1Fourv4s:
310 case AArch64::LD1Fourv8h:
311 case AArch64::LD1Fourv16b:
312 case AArch64::LD2Twov2s:
313 case AArch64::LD2Twov4s:
314 case AArch64::LD2Twov8b:
315 case AArch64::LD2Twov2d:
316 case AArch64::LD2Twov4h:
317 case AArch64::LD2Twov8h:
318 case AArch64::LD2Twov16b:
319 case AArch64::LD2Rv1d:
320 case AArch64::LD2Rv2s:
321 case AArch64::LD2Rv4s:
322 case AArch64::LD2Rv8b:
323 case AArch64::LD2Rv2d:
324 case AArch64::LD2Rv4h:
325 case AArch64::LD2Rv8h:
326 case AArch64::LD2Rv16b:
327 case AArch64::LD3Threev2s:
328 case AArch64::LD3Threev4h:
329 case AArch64::LD3Threev8b:
330 case AArch64::LD3Threev2d:
331 case AArch64::LD3Threev4s:
332 case AArch64::LD3Threev8h:
333 case AArch64::LD3Threev16b:
334 case AArch64::LD3Rv1d:
335 case AArch64::LD3Rv2s:
336 case AArch64::LD3Rv4h:
337 case AArch64::LD3Rv8b:
338 case AArch64::LD3Rv2d:
339 case AArch64::LD3Rv4s:
340 case AArch64::LD3Rv8h:
341 case AArch64::LD3Rv16b:
342 case AArch64::LD4Fourv2s:
343 case AArch64::LD4Fourv4h:
344 case AArch64::LD4Fourv8b:
345 case AArch64::LD4Fourv2d:
346 case AArch64::LD4Fourv4s:
347 case AArch64::LD4Fourv8h:
348 case AArch64::LD4Fourv16b:
349 case AArch64::LD4Rv1d:
350 case AArch64::LD4Rv2s:
351 case AArch64::LD4Rv4h:
352 case AArch64::LD4Rv8b:
353 case AArch64::LD4Rv2d:
354 case AArch64::LD4Rv4s:
355 case AArch64::LD4Rv8h:
356 case AArch64::LD4Rv16b:
363 case AArch64::LD1i64_POST:
364 case AArch64::LD2i64_POST:
371 case AArch64::LD1i8_POST:
372 case AArch64::LD1i16_POST:
373 case AArch64::LD1i32_POST:
374 case AArch64::LD2i8_POST:
375 case AArch64::LD2i16_POST:
376 case AArch64::LD2i32_POST:
377 case AArch64::LD3i8_POST:
378 case AArch64::LD3i16_POST:
379 case AArch64::LD3i32_POST:
380 case AArch64::LD3i64_POST:
381 case AArch64::LD4i8_POST:
382 case AArch64::LD4i16_POST:
383 case AArch64::LD4i32_POST:
384 case AArch64::LD4i64_POST:
391 case AArch64::LD1Onev1d_POST:
392 case AArch64::LD1Onev2s_POST:
393 case AArch64::LD1Onev4h_POST:
394 case AArch64::LD1Onev8b_POST:
395 case AArch64::LD1Onev2d_POST:
396 case AArch64::LD1Onev4s_POST:
397 case AArch64::LD1Onev8h_POST:
398 case AArch64::LD1Onev16b_POST:
399 case AArch64::LD1Rv1d_POST:
400 case AArch64::LD1Rv2s_POST:
401 case AArch64::LD1Rv4h_POST:
402 case AArch64::LD1Rv8b_POST:
403 case AArch64::LD1Rv2d_POST:
404 case AArch64::LD1Rv4s_POST:
405 case AArch64::LD1Rv8h_POST:
406 case AArch64::LD1Rv16b_POST:
413 case AArch64::LD1Twov1d_POST:
414 case AArch64::LD1Twov2s_POST:
415 case AArch64::LD1Twov4h_POST:
416 case AArch64::LD1Twov8b_POST:
417 case AArch64::LD1Twov2d_POST:
418 case AArch64::LD1Twov4s_POST:
419 case AArch64::LD1Twov8h_POST:
420 case AArch64::LD1Twov16b_POST:
421 case AArch64::LD1Threev1d_POST:
422 case AArch64::LD1Threev2s_POST:
423 case AArch64::LD1Threev4h_POST:
424 case AArch64::LD1Threev8b_POST:
425 case AArch64::LD1Threev2d_POST:
426 case AArch64::LD1Threev4s_POST:
427 case AArch64::LD1Threev8h_POST:
428 case AArch64::LD1Threev16b_POST:
429 case AArch64::LD1Fourv1d_POST:
430 case AArch64::LD1Fourv2s_POST:
431 case AArch64::LD1Fourv4h_POST:
432 case AArch64::LD1Fourv8b_POST:
433 case AArch64::LD1Fourv2d_POST:
434 case AArch64::LD1Fourv4s_POST:
435 case AArch64::LD1Fourv8h_POST:
436 case AArch64::LD1Fourv16b_POST:
437 case AArch64::LD2Twov2s_POST:
438 case AArch64::LD2Twov4s_POST:
439 case AArch64::LD2Twov8b_POST:
440 case AArch64::LD2Twov2d_POST:
441 case AArch64::LD2Twov4h_POST:
442 case AArch64::LD2Twov8h_POST:
443 case AArch64::LD2Twov16b_POST:
444 case AArch64::LD2Rv1d_POST:
445 case AArch64::LD2Rv2s_POST:
446 case AArch64::LD2Rv4s_POST:
447 case AArch64::LD2Rv8b_POST:
448 case AArch64::LD2Rv2d_POST:
449 case AArch64::LD2Rv4h_POST:
450 case AArch64::LD2Rv8h_POST:
451 case AArch64::LD2Rv16b_POST:
452 case AArch64::LD3Threev2s_POST:
453 case AArch64::LD3Threev4h_POST:
454 case AArch64::LD3Threev8b_POST:
455 case AArch64::LD3Threev2d_POST:
456 case AArch64::LD3Threev4s_POST:
457 case AArch64::LD3Threev8h_POST:
458 case AArch64::LD3Threev16b_POST:
459 case AArch64::LD3Rv1d_POST:
460 case AArch64::LD3Rv2s_POST:
461 case AArch64::LD3Rv4h_POST:
462 case AArch64::LD3Rv8b_POST:
463 case AArch64::LD3Rv2d_POST:
464 case AArch64::LD3Rv4s_POST:
465 case AArch64::LD3Rv8h_POST:
466 case AArch64::LD3Rv16b_POST:
467 case AArch64::LD4Fourv2s_POST:
468 case AArch64::LD4Fourv4h_POST:
469 case AArch64::LD4Fourv8b_POST:
470 case AArch64::LD4Fourv2d_POST:
471 case AArch64::LD4Fourv4s_POST:
472 case AArch64::LD4Fourv8h_POST:
473 case AArch64::LD4Fourv16b_POST:
474 case AArch64::LD4Rv1d_POST:
475 case AArch64::LD4Rv2s_POST:
476 case AArch64::LD4Rv4h_POST:
477 case AArch64::LD4Rv8b_POST:
478 case AArch64::LD4Rv2d_POST:
479 case AArch64::LD4Rv4s_POST:
480 case AArch64::LD4Rv8h_POST:
481 case AArch64::LD4Rv16b_POST:
488 case AArch64::LDRBBroW:
489 case AArch64::LDRBBroX:
490 case AArch64::LDRBBui:
491 case AArch64::LDRBroW:
492 case AArch64::LDRBroX:
493 case AArch64::LDRBui:
495 case AArch64::LDRDroW:
496 case AArch64::LDRDroX:
497 case AArch64::LDRDui:
498 case AArch64::LDRHHroW:
499 case AArch64::LDRHHroX:
500 case AArch64::LDRHHui:
501 case AArch64::LDRHroW:
502 case AArch64::LDRHroX:
503 case AArch64::LDRHui:
505 case AArch64::LDRQroW:
506 case AArch64::LDRQroX:
507 case AArch64::LDRQui:
508 case AArch64::LDRSBWroW:
509 case AArch64::LDRSBWroX:
510 case AArch64::LDRSBWui:
511 case AArch64::LDRSBXroW:
512 case AArch64::LDRSBXroX:
513 case AArch64::LDRSBXui:
514 case AArch64::LDRSHWroW:
515 case AArch64::LDRSHWroX:
516 case AArch64::LDRSHWui:
517 case AArch64::LDRSHXroW:
518 case AArch64::LDRSHXroX:
519 case AArch64::LDRSHXui:
520 case AArch64::LDRSWl:
521 case AArch64::LDRSWroW:
522 case AArch64::LDRSWroX:
523 case AArch64::LDRSWui:
525 case AArch64::LDRSroW:
526 case AArch64::LDRSroX:
527 case AArch64::LDRSui:
529 case AArch64::LDRWroW:
530 case AArch64::LDRWroX:
531 case AArch64::LDRWui:
533 case AArch64::LDRXroW:
534 case AArch64::LDRXroX:
535 case AArch64::LDRXui:
536 case AArch64::LDURBBi:
537 case AArch64::LDURBi:
538 case AArch64::LDURDi:
539 case AArch64::LDURHHi:
540 case AArch64::LDURHi:
541 case AArch64::LDURQi:
542 case AArch64::LDURSBWi:
543 case AArch64::LDURSBXi:
544 case AArch64::LDURSHWi:
545 case AArch64::LDURSHXi:
546 case AArch64::LDURSWi:
547 case AArch64::LDURSi:
548 case AArch64::LDURWi:
549 case AArch64::LDURXi:
556 case AArch64::LDRBBpost:
557 case AArch64::LDRBBpre:
558 case AArch64::LDRBpost:
559 case AArch64::LDRBpre:
560 case AArch64::LDRDpost:
561 case AArch64::LDRDpre:
562 case AArch64::LDRHHpost:
563 case AArch64::LDRHHpre:
564 case AArch64::LDRHpost:
565 case AArch64::LDRHpre:
566 case AArch64::LDRQpost:
567 case AArch64::LDRQpre:
568 case AArch64::LDRSBWpost:
569 case AArch64::LDRSBWpre:
570 case AArch64::LDRSBXpost:
571 case AArch64::LDRSBXpre:
572 case AArch64::LDRSHWpost:
573 case AArch64::LDRSHWpre:
574 case AArch64::LDRSHXpost:
575 case AArch64::LDRSHXpre:
576 case AArch64::LDRSWpost:
577 case AArch64::LDRSWpre:
578 case AArch64::LDRSpost:
579 case AArch64::LDRSpre:
580 case AArch64::LDRWpost:
581 case AArch64::LDRWpre:
582 case AArch64::LDRXpost:
583 case AArch64::LDRXpre:
590 case AArch64::LDNPDi:
591 case AArch64::LDNPQi:
592 case AArch64::LDNPSi:
602 case AArch64::LDPSWi:
611 case AArch64::LDPQpost:
612 case AArch64::LDPQpre:
613 case AArch64::LDPDpost:
614 case AArch64::LDPDpre:
615 case AArch64::LDPSpost:
616 case AArch64::LDPSpre:
623 case AArch64::LDPSWpost:
624 case AArch64::LDPSWpre:
625 case AArch64::LDPWpost:
626 case AArch64::LDPWpre:
627 case AArch64::LDPXpost:
628 case AArch64::LDPXpre:
637 Register BaseReg =
MI.getOperand(BaseRegIdx).getReg();
638 if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
642 LI.DestReg = DestRegIdx == -1 ?
Register() :
MI.getOperand(DestRegIdx).getReg();
643 LI.BaseReg = BaseReg;
644 LI.BaseRegIdx = BaseRegIdx;
645 LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &
MI.getOperand(OffsetIdx);
646 LI.IsPrePost = IsPrePost;
653 unsigned Dest = LI.DestReg ?
TRI->getEncodingValue(LI.DestReg) : 0;
654 unsigned Base =
TRI->getEncodingValue(LI.BaseReg);
656 if (LI.OffsetOpnd ==
nullptr)
658 else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||
659 LI.OffsetOpnd->isCPI())
661 else if (LI.OffsetOpnd->isReg())
662 Off = (1 << 5) |
TRI->getEncodingValue(LI.OffsetOpnd->getReg());
664 Off = LI.OffsetOpnd->getImm() >> 2;
680 TagMap[*
Tag].push_back(&
MI);
683 bool AnyCollisions =
false;
684 for (
auto &
P : TagMap) {
685 auto Size =
P.second.size();
687 for (
auto *
MI :
P.second) {
688 if (
TII->isStridedAccess(*
MI)) {
689 AnyCollisions =
true;
708 LR.addLiveOuts(*
MBB);
711 if (!
TII->isStridedAccess(
MI))
718 std::optional<unsigned> OptOldTag =
getTag(
TRI,
MI, LdI);
721 auto &OldCollisions = TagMap[*OptOldTag];
722 if (OldCollisions.size() <= 1)
735 for (
unsigned OpI = 0, OpE =
MI.getNumOperands(); OpI < OpE; ++OpI) {
736 if (OpI ==
static_cast<unsigned>(LdI.BaseRegIdx))
743 for (
unsigned ScratchReg : AArch64::GPR64RegClass) {
744 if (!LR.available(ScratchReg) ||
MRI.isReserved(ScratchReg))
748 NewLdI.BaseReg = ScratchReg;
751 if (TagMap.count(NewTag))
768 BaseOpnd.
setReg(ScratchReg);
775 MI.getOperand(0).setReg(
778 TII->get(AArch64::ORRXrs), LdI.BaseReg)
784 for (
int I = 0, E = OldCollisions.size();
I != E; ++
I)
785 if (OldCollisions[
I] == &
MI) {
786 std::swap(OldCollisions[
I], OldCollisions[E - 1]);
787 OldCollisions.pop_back();
795 TagMap[NewTag].push_back(&
MI);
796 ++NumCollisionsAvoided;
802 ++NumCollisionsNotAvoided;
809 if (
ST.getProcFamily() != AArch64Subtarget::Falkor)
815 TII =
ST.getInstrInfo();
816 TRI =
ST.getRegisterInfo();
818 MachineLoopInfo &LI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
825 if (
L->isInnermost())
unsigned const MachineRegisterInfo * MRI
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late static false unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset)
static std::optional< LoadInfo > getLoadInfo(const MachineInstr &MI)
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
static std::optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
aarch64 falkor hwpf fix late
#define FALKOR_STRIDED_ACCESS_MD
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
const HexagonInstrInfo * TII
Register const TargetRegisterInfo * TRI
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Target-Independent Code Generator Pass Configuration Options pass.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
LLVM Basic Block Representation.
static bool shouldExecute(unsigned CounterName)
Legacy analysis pass which computes a DominatorTree.
FunctionPass class - This class is used to implement most global optimizations.
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overridden by subclasses to do the per-function processing of the pass.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
A set of register units used to track register liveness.
An instruction for reading from memory.
Value * getPointerOperand()
The legacy pass manager's analysis pass to compute loop information.
Represents a single loop in the control flow graph.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
reverse_iterator rbegin()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
virtual MachineFunctionProperties getRequiredProperties() const
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
bool readsReg() const
readsReg - Returns true if this operand reads the previous value of its register.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
Wrapper class representing virtual and physical registers.
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Target-Independent Code Generator Pass Configuration Options.
TMC & getTM() const
Get the right type of TargetMachine for this target.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
LLVM Value Representation.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
FunctionPass * createFalkorMarkStridedAccessesPass()
FunctionPass * createFalkorHWPFFixPass()
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
iterator_range< df_iterator< T > > depth_first(const T &G)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
A record for a potential prefetch made during the initial scan of the loop.